/* esl_sq : a single biological sequence */ #ifndef eslSQ_INCLUDED #define eslSQ_INCLUDED #include #include "esl_alphabet.h" #include "esl_msa.h" #include "esl_random.h" /* random, randomseq add ability to sample random sq objects for unit tests */ #include "esl_randomseq.h" /* random, randomseq add ability to sample random sq objects for unit tests */ /* ESL_SQ - a biosequence * * Can be either in text mode , or digital mode . * One of them has to be NULL, and the other contains the data. * * When in text mode, and can hold up to * residues and a terminal '\0', and both are indexed <0..n-1>. * * When in digital mode, and can hold up to * residues; both are indexed <1..n>, and positions 0 and n+1 are * sentinel bytes. The digital sequence uses * as its sentinels; as a hack, uses '\0' as sentinels. This * means that ss+1> is a valid NUL-terminated C string, but * ss> itself would be a string of length 0 because of the * leading NUL sentinel. Watch out for this. * * To save on allocation calls, the structure is designed to be reused * for subsequent sequences, rather than free'd and reallocated - * thus, we keep track of the allocated sizes of all the strings. * * Notes on when we need to reallocate: * - In a text mode sequence (seq 0..n-1), byte salloc-1 is * reserved for the NUL, so the sequence is full when * n == salloc-1. * * - In a digital mode sequence (dsq 1..n), bytes 0 and salloc-1 * are reserved for sentinel bytes, so the reallocation condition * is when n == salloc-2. * * At least for now, the only way to set the structure annotation * field is by a CreateFrom(), by extraction from an MSA, or manually * (by directly allocating a sequence's field). * * A sequence object will usually be holding a complete (full length) * sequence. Three other cases arise less frequently: * * 1. We're a subsequence extracted from a source sequence. * is the name of the source. * is the length of the source (and coords are 1..L). * The subsequence is from .. on the source. * The length of the subsequence is abs(-)+1. * can be greater than for a nucleic acid sequence; * in this case, the subsequence is reverse complemented. * * 2. We're a window on a source sequence. * This is similar to being a subsequence, with the added * wrinkle that we're scanning over a long source sequence * in overlapping segments, defined by a "previous context" * and a "new window" (the whole sequence is n=C+W * residues long): * s C W e * current window: |------||------------| * next window read: |------||------------| * s C W e * Here, dsq[1..n] is source[s..e]; each newly read * window starts at dsq[C+1], and is preceded by C * residues of context. * * 3. We're just after information about the sequence, not the * sequence itself; everything except the per-residue information * (such as and ). We do this when SSI indexing, * for example, so we don't have to read entire huge seqs into * memory just to calculate their lengths for the index. * * Note/TODO: use of "\0" empty string to indicate lack of optional * acc, desc info is now deprecated. Cannot distinguish empty string * from lack of annotation. Should use NULL ptr instead. Fix this in * future. (21 Nov 09 xref J5/114) * * Easel dsq coordinates are "1-offset, closed-interval, no-flag". * 1-offset means 1..n, as opposed to 0..n-1. Closed-interval means * that start/end coords i..j refers to subseq x_i..x_j, not * x_i..x_j-1 (as in half-open coords of Python lists, for example). * No-flag means that we don't keep a flag for forward vs. reverse * strand on DNA/RNA sequences; instead, start>end means reverse * strand. A flaw in such a coord system is that we cannot * unambiguously represent fwd/rev strand for a subsequence of length * 1, where start=end. So in the case of start=end, Easel always * assumes fwd strand; if rev strand is needed, there will be some * sort of patchy workaround. */ typedef struct { /*::cexcerpt::sq_sq::begin::*/ char *name; /* name; one word, no whitespace ("\0" if no name) */ char *acc; /* optional accession (1 word) ("\0" if none) */ char *desc; /* description line ("\0" if no description) */ int32_t tax_id; /* NCBI taxonomy id (-1 if none) */ char *seq; /* sequence [0..n-1], or NULL if digital */ ESL_DSQ *dsq; /* digitized sequence [1..n], or NULL if text */ char *ss; /* optional sec structure [0..n-1], [1..n], or NULL */ int64_t n; /* length of seq (or dsq) and ss */ /*::cexcerpt::sq_sq::end::*/ /* Source-tracking coordinate info for: seq subseq window info */ /* ---- ------ ------ ----- */ int64_t start; /* coord of seq[0],dsq[1] on source [1..L] 1 1<=i<=L 1<=i<=L 0 */ int64_t end; /* coord of seq[n-1],dsq[n] on source[1..L] L 1<=j<=L 1<=j<=L 0 */ int64_t C; /* # of context residues for a window 0 0 n-W 0 */ int64_t W; /* window width L n n-C 0 */ int64_t L; /* source sequence length in residues L L (or -1) L (or -1) L */ /* and n: length of seq (or dsq) and ss actually stored: L abs(j-i)+1 C+W 0 */ /* In all the above bookkeeping, a -1 means "unknown" */ char *source; /* name of the source of a subseq/window; or MSA name; or ""*/ /* Memory allocation bookkeeping: (all inclusive of \0; >= strlen()+1) */ int nalloc; /* allocated length of name */ int aalloc; /* allocated length of accession */ int dalloc; /* allocated length of description */ int64_t salloc; /* alloc for seq or dsq, and ss if present */ int srcalloc; /* allocated length for source name */ /* Disk-tracking offset bookkeeping: */ int64_t idx; /* ctr for which # seq this is; -1 if not counting */ off_t roff; /* record offset (start of record); -1 if none */ off_t hoff; /* offset to last byte of header; -1 if unknown */ off_t doff; /* data offset (start of sequence data); -1 if none */ off_t eoff; /* offset to last byte of record; -1 if unknown */ /* Optional information for extra residue markups. * The number of them, and their tags are arbitrary */ char **xr_tag; /* markup tags for extra residue markups [0..ntr-1][free-text], [0..ntr-1][free-text], or NULL */ char **xr; /* annotations for extra residue markups [0..ntr-1][0..n-1], [0..ntr-1][1..n], or NULL */ int nxr; /* number of extra residue markups */ /* Copy of a pointer to the alphabet, if digital mode */ const ESL_ALPHABET *abc; /* reference to the alphabet for */ } ESL_SQ; typedef struct { int count; /* number of objects in the block */ int listSize; /* maximum number elements in the list */ int complete; /*TRUE if the the final ESL_SQ element on the block is complete, FALSE if it's only a partial winow of the full sequence*/ int64_t first_seqidx;/*unique identifier of the first ESL_SQ object on list; the seqidx of the i'th entry on list is first_seqidx+i */ ESL_SQ *list; /* array of objects */ } ESL_SQ_BLOCK; /* These control default initial allocation sizes in an ESL_SQ. */ #define eslSQ_NAMECHUNK 32 // allocation unit for name, source #define eslSQ_ACCCHUNK 32 // allocation unit for accession #define eslSQ_DESCCHUNK 128 // allocation unit for description #define eslSQ_SEQCHUNK 256 // allocation unit for seqs // .. dsqdata assumes _SEQCHUNK >= 4 extern ESL_SQ *esl_sq_Create(void); extern ESL_SQ *esl_sq_CreateFrom(const char *name, const char *seq, const char *desc, const char *acc, const char *ss); extern int esl_sq_Grow (ESL_SQ *sq, int64_t *ret_nsafe); extern int esl_sq_GrowTo(ESL_SQ *sq, int64_t n); extern int esl_sq_Copy(const ESL_SQ *src, ESL_SQ *dst); extern int esl_sq_Compare (ESL_SQ *sq1, ESL_SQ *sq2); extern int esl_sq_Reuse (ESL_SQ *sq); extern int esl_sq_IsDigital(const ESL_SQ *sq); extern int esl_sq_IsText (const ESL_SQ *sq); extern void esl_sq_Destroy (ESL_SQ *sq); extern int esl_sq_SetName (ESL_SQ *sq, const char *name); extern int esl_sq_SetAccession (ESL_SQ *sq, const char *acc); extern int esl_sq_SetDesc (ESL_SQ *sq, const char *desc); extern int esl_sq_SetSource (ESL_SQ *sq, const char *source); extern int esl_sq_FormatName (ESL_SQ *sq, const char *name, ...) ESL_ATTRIBUTE_FORMAT(printf, 2, 3); extern int esl_sq_FormatAccession(ESL_SQ *sq, const char *acc, ...) ESL_ATTRIBUTE_FORMAT(printf, 2, 3); extern int esl_sq_FormatDesc (ESL_SQ *sq, const char *desc, ...) ESL_ATTRIBUTE_FORMAT(printf, 2, 3); extern int esl_sq_FormatSource (ESL_SQ *sq, const char *source, ...) ESL_ATTRIBUTE_FORMAT(printf, 2, 3); extern int esl_sq_AppendDesc (ESL_SQ *sq, const char *desc); extern int esl_sq_SetCoordComplete(ESL_SQ *sq, int64_t L); extern int esl_sq_CAddResidue (ESL_SQ *sq, char c); extern int esl_sq_ReverseComplement(ESL_SQ *sq); extern int esl_sq_Checksum(const ESL_SQ *sq, uint32_t *ret_checksum); extern int esl_sq_CountResidues(const ESL_SQ *sq, int start, int L, float *f); extern ESL_SQ *esl_sq_CreateDigital(const ESL_ALPHABET *abc); extern ESL_SQ *esl_sq_CreateDigitalFrom(const ESL_ALPHABET *abc, const char *name, const ESL_DSQ *dsq, int64_t L, const char *desc, const char *acc, const char *ss); extern int esl_sq_Digitize(const ESL_ALPHABET *abc, ESL_SQ *sq); extern int esl_sq_Textize(ESL_SQ *sq); extern int esl_sq_GuessAlphabet(ESL_SQ *sq, int *ret_type); extern int esl_sq_XAddResidue(ESL_SQ *sq, ESL_DSQ x); extern int esl_sq_ConvertDegen2X(ESL_SQ *sq); extern int esl_sq_GetFromMSA (const ESL_MSA *msa, int which, ESL_SQ *sq); extern int esl_sq_FetchFromMSA(const ESL_MSA *msa, int which, ESL_SQ **ret_sq); extern ESL_SQ_BLOCK *esl_sq_CreateBlock(int count); extern int esl_sq_BlockGrowTo(ESL_SQ_BLOCK *sqblock, int newsize, int do_digital, const ESL_ALPHABET *abc); extern ESL_SQ_BLOCK *esl_sq_CreateDigitalBlock(int count, const ESL_ALPHABET *abc); extern void esl_sq_DestroyBlock(ESL_SQ_BLOCK *sqBlock); extern int esl_sq_BlockReallocSequences(ESL_SQ_BLOCK *block); extern int esl_sq_Validate(ESL_SQ *sq, char *errmsg); extern int esl_sq_Sample(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, int maxL, ESL_SQ **ret_sq); #endif /*eslSQ_INCLUDED*/