/* Digital representation of biosequence symbols in Easel. */ #ifndef eslALPHABET_INCLUDED #define eslALPHABET_INCLUDED #include "esl_config.h" #include /* isascii() */ #include "easel.h" #ifdef __cplusplus // magic to make C++ compilers happy extern "C" { #endif /* Flags for alphabet types. * Do not change, only add, because these codes are used in file formats. */ #define eslUNKNOWN 0 /* 0=unknown is easel-wide convention; don't change */ #define eslRNA 1 #define eslDNA 2 #define eslAMINO 3 #define eslCOINS 4 /* for toy examples */ #define eslDICE 5 /* also for toy examples */ #define eslNONSTANDARD 6 /* ... if you add here, change esl_abc_ValidateType() too. */ /* Structure: ESL_ALPHABET */ typedef struct { int type; /* eslDNA, eslRNA, eslAMINO, eslNONSTANDARD, etc. */ int K; /* uniq alphabet size: 4 or 20 */ int Kp; /* total size: alphabet + degen + gap + missing */ char *sym; /* "ACGT-RYMKSWHBVDN*~", for instance [0..Kp-1] */ ESL_DSQ inmap[128]; /* inmap['A'] = 0, etc: dsq[] index for a symbol */ char **degen; /* 1/0, which syms inc which res [0..Kp-1][0..K-1] */ int *ndegen; /* # of degenerate residues per code [0..Kp-1] */ ESL_DSQ *complement; /* maps sym to complements, [0..Kp-1]; NULL if not DNA/RNA */ } ESL_ALPHABET; /* 1. An ESL_ALPHABET object. */ extern ESL_ALPHABET *esl_alphabet_Create(int type); extern ESL_ALPHABET *esl_alphabet_CreateCustom(const char *alphabet, int K, int Kp); extern int esl_alphabet_SetEquiv(ESL_ALPHABET *a, char sym, char c); extern int esl_alphabet_SetCaseInsensitive(ESL_ALPHABET *a); extern int esl_alphabet_SetDegeneracy(ESL_ALPHABET *a, char c, char *ds); extern int esl_alphabet_SetIgnored(ESL_ALPHABET *a, const char *ignoredchars); extern size_t esl_alphabet_Sizeof(ESL_ALPHABET *a); extern void esl_alphabet_Destroy(ESL_ALPHABET *a); /* 2. Digitized sequences. */ extern int esl_abc_CreateDsq(const ESL_ALPHABET *a, const char *seq, ESL_DSQ **ret_dsq); extern int esl_abc_Digitize (const ESL_ALPHABET *a, const char *seq, ESL_DSQ *dsq); extern int esl_abc_Textize (const ESL_ALPHABET *a, const ESL_DSQ *dsq, int64_t L, char *seq); extern int esl_abc_TextizeN (const ESL_ALPHABET *a, const ESL_DSQ *dptr, int64_t L, char *buf); extern int esl_abc_dsqcpy(const ESL_DSQ *dsq, int64_t L, ESL_DSQ *dcopy); extern int esl_abc_dsqdup(const ESL_DSQ *dsq, int64_t L, ESL_DSQ **ret_dup); extern int esl_abc_dsqcat (const ESL_DSQ *inmap, ESL_DSQ **dsq, int64_t *L, const char *s, esl_pos_t n); extern int esl_abc_dsqcat_noalloc(const ESL_DSQ *inmap, ESL_DSQ *dsq, int64_t *L, const char *s, esl_pos_t n); extern int64_t esl_abc_dsqlen(const ESL_DSQ *dsq); extern int64_t esl_abc_dsqrlen(const ESL_ALPHABET *a, const ESL_DSQ *dsq); extern int esl_abc_CDealign(const ESL_ALPHABET *abc, char *s, const ESL_DSQ *ref_ax, int64_t *opt_rlen); extern int esl_abc_XDealign(const ESL_ALPHABET *abc, ESL_DSQ *x, const ESL_DSQ *ref_ax, int64_t *opt_rlen); extern int esl_abc_ConvertDegen2X(const ESL_ALPHABET *abc, ESL_DSQ *dsq); extern int esl_abc_revcomp(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int n); /* 3. Other routines in the API. */ extern int esl_abc_ValidateType(int type); extern int esl_abc_GuessAlphabet(const int64_t *ct, int *ret_type); extern double esl_abc_Match (const ESL_ALPHABET *a, ESL_DSQ x, ESL_DSQ y, double *p); extern int esl_abc_IAvgScore (const ESL_ALPHABET *a, ESL_DSQ x, const int *sc); extern float esl_abc_FAvgScore (const ESL_ALPHABET *a, ESL_DSQ x, const float *sc); extern double esl_abc_DAvgScore (const ESL_ALPHABET *a, ESL_DSQ x, const double *sc); extern int esl_abc_IExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc, const float *p); extern float esl_abc_FExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc, const float *p); extern double esl_abc_DExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc, const double *p); extern int esl_abc_IAvgScVec (const ESL_ALPHABET *a, int *sc); extern int esl_abc_FAvgScVec (const ESL_ALPHABET *a, float *sc); extern int esl_abc_DAvgScVec (const ESL_ALPHABET *a, double *sc); extern int esl_abc_IExpectScVec(const ESL_ALPHABET *a, int *sc, const float *p); extern int esl_abc_FExpectScVec(const ESL_ALPHABET *a, float *sc, const float *p); extern int esl_abc_DExpectScVec(const ESL_ALPHABET *a, double *sc, const double *p); extern int esl_abc_FCount (const ESL_ALPHABET *a, float *ct, ESL_DSQ x, float wt); extern int esl_abc_DCount (const ESL_ALPHABET *a, double *ct, ESL_DSQ x, double wt); extern int esl_abc_EncodeType (char *typestring); extern int esl_abc_EncodeTypeMem(char *type, int n); extern char *esl_abc_DecodeType (int type); extern int esl_abc_ValidateSeq(const ESL_ALPHABET *a, const char *seq, int64_t L, char *errbuf); /* In the tests below, remember the rules of order in internal alphabets: * Canonical alphabet Gap Degeneracies Any None Missing * 0..K-1 K K+1..Kp-4 (Kp-3) (Kp-2) (Kp-1) * ACGT - RYMKSWHBVD N * ~ DNA: K=4 Kp=18 * ACDEFGHIKLMNPQRSTVWY - BJZOU X * ~ protein: K=20 Kp=29 * * ESL_DSQ is an unsigned 8-bit type, so don't test for >= 0 or compilers will complain. */ #define esl_abc_DigitizeSymbol(a, c) ((a)->inmap[(int)c]) #define esl_abc_XIsValid(a, x) ((x) < (a)->Kp) #define esl_abc_XIsResidue(a, x) ((x) < (a)->K || ((x) > (a)->K && (x) < (a)->Kp-2)) #define esl_abc_XIsCanonical(a, x) ((x) < (a)->K) #define esl_abc_XIsGap(a, x) ((x) == (a)->K) #define esl_abc_XIsDegenerate(a, x) ((x) > (a)->K && (x) < (a)->Kp-2) #define esl_abc_XIsUnknown(a, x) ((x) == (a)->Kp-3) #define esl_abc_XIsNonresidue(a, x) ((x) == (a)->Kp-2) #define esl_abc_XIsMissing(a, x) ((x) == (a)->Kp-1) #define esl_abc_XGetGap(a) ((a)->K) #define esl_abc_XGetUnknown(a) ((a)->Kp-3) #define esl_abc_XGetNonresidue(a) ((a)->Kp-2) #define esl_abc_XGetMissing(a) ((a)->Kp-1) #define esl_abc_CIsValid(a, c) (isascii(c) && (a)->inmap[(int)c] < (a)->Kp) #define esl_abc_CIsResidue(a, c) ((a)->inmap[(int)c] < (a)->K || ((a)->inmap[(int)c] > (a)->K && (a)->inmap[(int)c] < (a)->Kp-2)) #define esl_abc_CIsCanonical(a, c) ((a)->inmap[(int)c] < (a)->K) #define esl_abc_CIsGap(a, c) ((a)->inmap[(int)c] == (a)->K) #define esl_abc_CIsDegenerate(a, c) ((a)->inmap[(int)c] > (a)->K && (a)->inmap[(int)c] < (a)->Kp-2) #define esl_abc_CIsUnknown(a, c) ((a)->inmap[(int)c] == (a)->Kp-3) #define esl_abc_CIsNonresidue(a, c) ((a)->inmap[(int)c] == (a)->Kp-2) #define esl_abc_CIsMissing(a, c) ((a)->inmap[(int)c] == (a)->Kp-1) #define esl_abc_CGetGap(a) ((a)->sym[(a)->K]) #define esl_abc_CGetUnknown(a) ((a)->sym[(a)->Kp-3]) #define esl_abc_CGetNonresidue(a) ((a)->sym[(a)->Kp-2]) #define esl_abc_CGetMissing(a) ((a)->sym[(a)->Kp-1]) #ifdef __cplusplus // magic to make C++ compilers happy } #endif #endif /*eslALPHABET_INCLUDED*/