/* Multiple sequence alignments. * * Contents: * 1. The object * 2. Digital mode MSA's * 3. Setting, checking data fields in an * 4. Miscellaneous functions for manipulating MSAs * 5. Debugging, testing, development * 6. Unit tests * 7. Test driver */ #include "esl_config.h" #include #include #include #include #include #ifdef HAVE_STRINGS_H #include /* POSIX strcasecmp() */ #endif #include "easel.h" #include "esl_alphabet.h" #include "esl_bitfield.h" #include "esl_arr2.h" #include "esl_arr3.h" #include "esl_keyhash.h" #include "esl_mem.h" #include "esl_random.h" #include "esl_randomseq.h" #include "esl_ssi.h" #include "esl_vectorops.h" #include "esl_wuss.h" #include "esl_msa.h" /****************************************************************************** *# 1. The object *****************************************************************************/ static ESL_MSA *msa_create_mostly(int nseq, int64_t alen); /* Function: esl_msa_Create() * Synopsis: Creates an object. * * Purpose: Creates and initializes an object, and returns a * pointer to it. * * If caller already knows the dimensions of the alignment, * both and , then allocates the whole thing at once. The MSA's * and fields are set accordingly, and the * caller doesn't have to worry about setting them; it can * just fill in . * * If caller doesn't know the dimensions of the alignment * (for example, when parsing an alignment file), then * is taken to be an initial allocation size, and * must be -1. is used as a flag for a * "growable" MSA. For example, the call . allocates internally for an * initial block of 16 sequences, but without allocating * any space for individual sequences. This allocation can * be expanded (by doubling) by calling . * A created can only be <_Expand()>'ed if is * -1. * * In a growable alignment, caller becomes responsible for * memory allocation of each individual . Caller * is also responsible for setting and when * it is done parsing and creating the new MSA. In * particular, the function relies on * to know how many individual sequences are * allocated. * * Args: - number of sequences, or nseq allocation blocksize * - length of alignment in columns, or -1 * * Returns: pointer to new MSA object, w/ all values initialized. * * Throws: on allocation failure. */ ESL_MSA * esl_msa_Create(int nseq, int64_t alen) { int status; ESL_MSA *msa; int i; ESL_DASSERT1(( nseq > 0 )); ESL_DASSERT1(( alen >= -1)); msa = msa_create_mostly(nseq, alen); /* aseq is null upon successful return */ if (msa == NULL) return NULL; /* already threw error in msa_create_mostly, so percolate */ ESL_ALLOC(msa->aseq, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->aseq[i] = NULL; if (alen != -1) { for (i = 0; i < nseq; i++) { ESL_ALLOC(msa->aseq[i], sizeof(char) * (alen+1)); msa->aseq[i][alen] = '\0'; /* caller might forget to null terminate; help the poor */ } msa->nseq = nseq; } return msa; ERROR: esl_msa_Destroy(msa); return NULL; } /* Function: esl_msa_Expand() * Synopsis: Reallocate for more sequences. * * Purpose: Double the current sequence allocation in . * Typically used when we're reading an alignment sequentially * from a file, so we don't know nseq 'til we're done. * * Returns: on success. * * Throws: on reallocation failure; is undamaged, * and the caller may attempt to recover from the error. * * Throws if is not growable: its * field must be -1 to be growable. * * Xref: squid's MSAExpand(), 1999. */ int esl_msa_Expand(ESL_MSA *msa) { int status; int old, new_size; /* old & new allocation sizes (max # seqs) */ int i,j; if (msa->alen != -1) ESL_EXCEPTION(eslEINVAL, "that MSA is not growable"); old = msa->sqalloc; new_size = 2*old; /* Normally either aseq (ascii) or ax (digitized) would be active, not both. * We could make sure that that's true, but that's checked elsewhere. */ if (msa->aseq) ESL_REALLOC(msa->aseq, sizeof(char *) * new_size); if (msa->ax) ESL_REALLOC(msa->ax, sizeof(ESL_DSQ *) * new_size); ESL_REALLOC(msa->sqname, sizeof(char *) * new_size); ESL_REALLOC(msa->wgt, sizeof(double) * new_size); ESL_REALLOC(msa->sqlen, sizeof(int64_t)* new_size); if (msa->ss) { ESL_REALLOC(msa->ss, sizeof(char *) * new_size); ESL_REALLOC(msa->sslen, sizeof(int64_t) * new_size); } if (msa->sa) { ESL_REALLOC(msa->sa, sizeof(char *) * new_size); ESL_REALLOC(msa->salen, sizeof(int64_t) * new_size); } if (msa->pp) { ESL_REALLOC(msa->pp, sizeof(char *) * new_size); ESL_REALLOC(msa->pplen, sizeof(int64_t) * new_size); } if (msa->sqacc) ESL_REALLOC(msa->sqacc, sizeof(char *) * new_size); if (msa->sqdesc) ESL_REALLOC(msa->sqdesc, sizeof(char *) * new_size); for (i = old; i < new_size; i++) { if (msa->aseq) msa->aseq[i] = NULL; if (msa->ax) msa->ax[i] = NULL; msa->sqname[i] = NULL; msa->wgt[i] = -1.0; /* -1.0 means "unset so far" */ msa->sqlen[i] = 0; if (msa->ss) { msa->ss[i] = NULL; msa->sslen[i] = 0; } if (msa->sa) { msa->sa[i] = NULL; msa->salen[i] = 0; } if (msa->pp) { msa->pp[i] = NULL; msa->pplen[i] = 0; } if (msa->sqacc) msa->sqacc[i] = NULL; if (msa->sqdesc) msa->sqdesc[i] = NULL; } /* Reallocate and re-init for unparsed #=GS tags, if we have some. * gs is [0..ngs-1][0..nseq-1][], so we're reallocing the middle * set of pointers. */ if (msa->gs) for (i = 0; i < msa->ngs; i++) { if (msa->gs[i]) { ESL_REALLOC(msa->gs[i], sizeof(char *) * new_size); for (j = old; j < new_size; j++) msa->gs[i][j] = NULL; } } /* Reallocate and re-init for unparsed #=GR tags, if we have some. * gr is [0..ngs-1][0..nseq-1][], so we're reallocing the middle * set of pointers. */ if (msa->gr) for (i = 0; i < msa->ngr; i++) { if (msa->gr[i]) { ESL_REALLOC(msa->gr[i], sizeof(char *) * new_size); for (j = old; j < new_size; j++) msa->gr[i][j] = NULL; } } msa->sqalloc = new_size; return eslOK; ERROR: return status; } /* Function: esl_msa_Copy() * Synopsis: Copies an MSA. * * Purpose: Makes a copy of in . Caller has * already allocated to hold an MSA of * at least nseq> sequences and alen> * columns. * * Note: Because MSA's are not reusable, this function does a * lot of internal allocation for optional fields, without * checking to see if space was already allocated. To * reuse an MSA and copy new data into it, we'll * eventually need a function, and/or * recode this to reuse or free any already-allocated * optional memory it encounters in . Until then, * it's unlikely that is useful on its own; * the caller would be expected to call * instead. * * Returns: on success. * * Throws: on allocation failure. In this case, * was only partially constructed, and should be treated * as corrupt. */ int esl_msa_Copy(const ESL_MSA *msa, ESL_MSA *new_msa) { int i, x, j; int status; /* aseq[0..nseq-1][0..alen-1] strings, * or ax[0..nseq-1][(0) 1..alen (alen+1)] digital seqs * must have one of them allocated already. */ if (! (msa->flags & eslMSA_DIGITAL)) for (i = 0; i < msa->nseq; i++) strcpy(new_msa->aseq[i], msa->aseq[i]); else { for (i = 0; i < msa->nseq; i++) memcpy(new_msa->ax[i], msa->ax[i], (msa->alen+2) * sizeof(ESL_DSQ)); new_msa->abc = msa->abc; } for (i = 0; i < msa->nseq; i++) { esl_strdup(msa->sqname[i], -1, &(new_msa->sqname[i])); new_msa->wgt[i] = msa->wgt[i]; } /* alen, nseq were already set by Create() */ new_msa->flags = msa->flags; esl_strdup(msa->name, -1, &(new_msa->name)); esl_strdup(msa->desc, -1, &(new_msa->desc)); esl_strdup(msa->acc, -1, &(new_msa->acc)); esl_strdup(msa->au, -1, &(new_msa->au)); esl_strdup(msa->ss_cons, -1, &(new_msa->ss_cons)); esl_strdup(msa->sa_cons, -1, &(new_msa->sa_cons)); esl_strdup(msa->pp_cons, -1, &(new_msa->pp_cons)); esl_strdup(msa->rf, -1, &(new_msa->rf)); esl_strdup(msa->mm, -1, &(new_msa->mm)); if (msa->sqacc != NULL) { ESL_ALLOC(new_msa->sqacc, sizeof(char *) * new_msa->sqalloc); for (i = 0; i < msa->nseq; i++) esl_strdup(msa->sqacc[i], -1, &(new_msa->sqacc[i])); for ( ; i < new_msa->sqalloc; i++) new_msa->sqacc[i] = NULL; } if (msa->sqdesc != NULL) { ESL_ALLOC(new_msa->sqdesc, sizeof(char *) * new_msa->sqalloc); for (i = 0; i < msa->nseq; i++) esl_strdup(msa->sqdesc[i], -1, &(new_msa->sqdesc[i])); for ( ; i < new_msa->sqalloc; i++) new_msa->sqdesc[i] = NULL; } if (msa->ss != NULL) { ESL_ALLOC(new_msa->ss, sizeof(char *) * new_msa->sqalloc); for (i = 0; i < msa->nseq; i++) esl_strdup(msa->ss[i], -1, &(new_msa->ss[i])); for ( ; i < new_msa->sqalloc; i++) new_msa->ss[i] = NULL; } if (msa->sa != NULL) { ESL_ALLOC(new_msa->sa, sizeof(char *) * msa->nseq); for (i = 0; i < msa->nseq; i++) esl_strdup(msa->sa[i], -1, &(new_msa->sa[i])); for ( ; i < new_msa->sqalloc; i++) new_msa->sa[i] = NULL; } if (msa->pp != NULL) { ESL_ALLOC(new_msa->pp, sizeof(char *) * msa->nseq); for (i = 0; i < msa->nseq; i++) esl_strdup(msa->pp[i], -1, &(new_msa->pp[i])); for ( ; i < new_msa->sqalloc; i++) new_msa->pp[i] = NULL; } for (x = 0; x < eslMSA_NCUTS; x++) { new_msa->cutoff[x] = msa->cutoff[x]; new_msa->cutset[x] = msa->cutset[x]; } if (msa->ncomment > 0) { ESL_ALLOC(new_msa->comment, sizeof(char *) * msa->ncomment); new_msa->ncomment = msa->ncomment; new_msa->alloc_ncomment = msa->ncomment; for (i = 0; i < msa->ncomment; i++) esl_strdup(msa->comment[i], -1, &(new_msa->comment[i])); } if (msa->ngf > 0) { ESL_ALLOC(new_msa->gf_tag, sizeof(char *) * msa->ngf); ESL_ALLOC(new_msa->gf, sizeof(char *) * msa->ngf); new_msa->ngf = msa->ngf; new_msa->alloc_ngf = msa->ngf; for (i = 0; i < msa->ngf; i++) { esl_strdup(msa->gf_tag[i], -1, &(new_msa->gf_tag[i])); esl_strdup(msa->gf[i], -1, &(new_msa->gf[i])); } } if (msa->ngs > 0) { ESL_ALLOC(new_msa->gs_tag, sizeof(char *) * msa->ngs); ESL_ALLOC(new_msa->gs, sizeof(char **) * msa->ngs); new_msa->ngs = msa->ngs; for (i = 0; i < msa->ngs; i++) { ESL_ALLOC(new_msa->gs[i], sizeof(char *) * msa->nseq); esl_strdup(msa->gs_tag[i], -1, &(new_msa->gs_tag[i])); for (j = 0; j < msa->nseq; j++) esl_strdup(msa->gs[i][j], -1, &(new_msa->gs[i][j])); } } if (msa->ngc > 0) { ESL_ALLOC(new_msa->gc_tag, sizeof(char *) * msa->ngc); ESL_ALLOC(new_msa->gc, sizeof(char *) * msa->ngc); new_msa->ngc = msa->ngc; for (i = 0; i < msa->ngc; i++) { esl_strdup(msa->gc_tag[i], -1, &(new_msa->gc_tag[i])); esl_strdup(msa->gc[i], -1, &(new_msa->gc[i])); } } if (msa->ngr > 0) { ESL_ALLOC(new_msa->gr_tag, sizeof(char *) * msa->ngr); ESL_ALLOC(new_msa->gr, sizeof(char **) * msa->ngr); new_msa->ngr = msa->ngr; for (i = 0; i < msa->ngr; i++) { ESL_ALLOC(new_msa->gr[i], sizeof(char *) * msa->nseq); esl_strdup(msa->gr_tag[i], -1, &(new_msa->gr_tag[i])); for (j = 0; j < msa->nseq; j++) esl_strdup(msa->gr[i][j], -1, &(new_msa->gr[i][j])); } } esl_keyhash_Destroy(new_msa->index); new_msa->index = NULL; esl_keyhash_Destroy(new_msa->gs_idx); new_msa->gs_idx = NULL; esl_keyhash_Destroy(new_msa->gc_idx); new_msa->gc_idx = NULL; esl_keyhash_Destroy(new_msa->gr_idx); new_msa->gr_idx = NULL; if (msa->index != NULL) new_msa->index = esl_keyhash_Clone(msa->index); if (msa->gs_idx != NULL) new_msa->gs_idx = esl_keyhash_Clone(msa->gs_idx); if (msa->gc_idx != NULL) new_msa->gc_idx = esl_keyhash_Clone(msa->gc_idx); if (msa->gr_idx != NULL) new_msa->gr_idx = esl_keyhash_Clone(msa->gr_idx); new_msa->offset = msa->offset; return eslOK; ERROR: return status; } /* Function: esl_msa_Clone() * Synopsis: Duplicates an MSA. * * Purpose: Make a duplicate of , in newly * allocated space. * * Returns: a pointer to the newly allocated clone. * Caller is responsible for free'ing it. * * Throws: on allocation error. */ ESL_MSA * esl_msa_Clone(const ESL_MSA *msa) { ESL_MSA *nw = NULL; int status; if (msa->flags & eslMSA_DIGITAL) { if ((nw = esl_msa_CreateDigital(msa->abc, msa->nseq, msa->alen)) == NULL) return NULL; } else if ((nw = esl_msa_Create(msa->nseq, msa->alen)) == NULL) return NULL; if ((status = esl_msa_Copy(msa, nw) ) != eslOK) goto ERROR; return nw; ERROR: esl_msa_Destroy(nw); return NULL; } /* Function: esl_msa_Sizeof() * Synopsis: Returns approximate size of an ESL_MSA, in bytes * Incept: SRE, Thu Nov 2 11:17:18 2017 * * Purpose: Returns the approximate size of an , in * bytes. Approximate, because it counts used data size * (the size of the alignment) rather than alloced size * (the actual memory required by the structure), * and the structure may be overallocated (e.g. by * .) That is, returns the minimum * size required to store the data. * * (We may want to distinguish between true allocated * size versus minimum size in the future.) */ size_t esl_msa_Sizeof(ESL_MSA *msa) { size_t n = 0; n += sizeof(ESL_MSA); n += esl_arr2_SSizeof(msa->sqname, msa->nseq); n += sizeof(double) * msa->nseq; // wgt if (msa->aseq) n += esl_arr2_SSizeof(msa->aseq, msa->nseq); else if (msa->ax) { n += sizeof(ESL_DSQ *) * msa->nseq; n += sizeof(ESL_DSQ) * msa->nseq * (msa->alen + 2); } if (msa->name) n += sizeof(char) * (1 + strlen(msa->name)); if (msa->desc) n += sizeof(char) * (1 + strlen(msa->desc)); if (msa->acc) n += sizeof(char) * (1 + strlen(msa->acc)); if (msa->au) n += sizeof(char) * (1 + strlen(msa->au)); if (msa->ss_cons) n += sizeof(char) * msa->alen; if (msa->sa_cons) n += sizeof(char) * msa->alen; if (msa->pp_cons) n += sizeof(char) * msa->alen; if (msa->rf) n += sizeof(char) * msa->alen; if (msa->mm) n += sizeof(char) * msa->alen; n += esl_arr2_SSizeof(msa->sqacc, msa->nseq); n += esl_arr2_SSizeof(msa->sqdesc, msa->nseq); n += esl_arr2_SSizeof(msa->ss, msa->nseq); n += esl_arr2_SSizeof(msa->sa, msa->nseq); n += esl_arr2_SSizeof(msa->pp, msa->nseq); n += esl_arr2_SSizeof(msa->comment, msa->ncomment); n += esl_arr2_SSizeof(msa->gf_tag, msa->ngf); n += esl_arr2_SSizeof(msa->gf, msa->ngf); n += esl_arr2_SSizeof(msa->gs_tag, msa->ngs); n += esl_arr3_SSizeof(msa->gs, msa->ngs, msa->nseq); n += esl_arr2_SSizeof(msa->gc_tag, msa->ngc); n += esl_arr2_SSizeof(msa->gc, msa->ngc); n += esl_arr2_SSizeof(msa->gr_tag, msa->ngr); n += esl_arr3_SSizeof(msa->gr, msa->ngr, msa->nseq); n += esl_keyhash_Sizeof(msa->index); n += esl_keyhash_Sizeof(msa->gs_idx); n += esl_keyhash_Sizeof(msa->gc_idx); n += esl_keyhash_Sizeof(msa->gr_idx); return n; } /* Function: esl_msa_Destroy() * Synopsis: Frees an . * * Purpose: Destroys . * * Xref: squid's MSADestroy(). */ void esl_msa_Destroy(ESL_MSA *msa) { if (msa == NULL) return; esl_arr2_Destroy((void **) msa->aseq, msa->nseq); esl_arr2_Destroy((void **) msa->ax, msa->nseq); esl_arr2_Destroy((void **) msa->sqname, msa->nseq); esl_arr2_Destroy((void **) msa->sqacc, msa->nseq); esl_arr2_Destroy((void **) msa->sqdesc, msa->nseq); esl_arr2_Destroy((void **) msa->ss, msa->nseq); esl_arr2_Destroy((void **) msa->sa, msa->nseq); esl_arr2_Destroy((void **) msa->pp, msa->nseq); if (msa->sqlen != NULL) free(msa->sqlen); if (msa->wgt != NULL) free(msa->wgt); if (msa->name != NULL) free(msa->name); if (msa->desc != NULL) free(msa->desc); if (msa->acc != NULL) free(msa->acc); if (msa->au != NULL) free(msa->au); if (msa->ss_cons != NULL) free(msa->ss_cons); if (msa->sa_cons != NULL) free(msa->sa_cons); if (msa->pp_cons != NULL) free(msa->pp_cons); if (msa->rf != NULL) free(msa->rf); if (msa->mm != NULL) free(msa->mm); if (msa->sslen != NULL) free(msa->sslen); if (msa->salen != NULL) free(msa->salen); if (msa->pplen != NULL) free(msa->pplen); esl_arr2_Destroy((void **) msa->comment, msa->ncomment); esl_arr2_Destroy((void **) msa->gf_tag, msa->ngf); esl_arr2_Destroy((void **) msa->gf, msa->ngf); esl_arr2_Destroy((void **) msa->gs_tag, msa->ngs); esl_arr3_Destroy((void ***)msa->gs, msa->ngs, msa->nseq); esl_arr2_Destroy((void **) msa->gc_tag, msa->ngc); esl_arr2_Destroy((void **) msa->gc, msa->ngc); esl_arr2_Destroy((void **) msa->gr_tag, msa->ngr); esl_arr3_Destroy((void ***)msa->gr, msa->ngr, msa->nseq); esl_keyhash_Destroy(msa->index); esl_keyhash_Destroy(msa->gs_idx); esl_keyhash_Destroy(msa->gc_idx); esl_keyhash_Destroy(msa->gr_idx); free(msa); return; } /* msa_create_mostly() * * This is the routine called by esl_msa_Create() and esl_msa_CreateDigital() * that does all allocation except the aseq/ax alignment data. * * may be the exact known # of seqs in an alignment; or * may be an allocation block size (to be expanded by doubling, in * esl_msa_Expand(), as in: * nseq == msa->sqalloc) esl_msa_Expand(msa);> * should not be 0. * * may be the exact length of an alignment, in columns; or it * may be -1, which states that your parser will take responsibility * for expanding as needed as new input is read into a growing new * alignment. * * A created can only be <_Expand()>'ed if is -1. * * Args: - number of sequences, or nseq allocation blocksize * - length of alignment in columns, or -1 * * Returns: pointer to new MSA object, w/ all values initialized. * Note that msa->nseq is initialized to 0 here, even though space * is allocated. * * Throws: on allocation failure. */ static ESL_MSA * msa_create_mostly(int nseq, int64_t alen) { int status; ESL_MSA *msa = NULL; int i; ESL_ALLOC(msa, sizeof(ESL_MSA)); msa->aseq = NULL; msa->sqname = NULL; msa->wgt = NULL; msa->alen = alen; /* if -1, then we're growable. */ msa->nseq = 0; /* our caller (text or digital allocation) sets this. */ msa->flags = 0; msa->abc = NULL; msa->ax = NULL; msa->name = NULL; msa->desc = NULL; msa->acc = NULL; msa->au = NULL; msa->ss_cons = NULL; msa->sa_cons = NULL; msa->pp_cons = NULL; msa->rf = NULL; msa->mm = NULL; msa->sqacc = NULL; msa->sqdesc = NULL; msa->ss = NULL; msa->sa = NULL; msa->pp = NULL; for (i = 0; i < eslMSA_NCUTS; i++) { msa->cutoff[i] = 0.; msa->cutset[i] = FALSE; } msa->sqalloc = nseq; msa->sqlen = NULL; msa->sslen = NULL; msa->salen = NULL; msa->pplen = NULL; msa->lastidx = 0; /* Unparsed markup, including comments and Stockholm tags. */ msa->comment = NULL; msa->ncomment = 0; msa->alloc_ncomment = 0; msa->gf_tag = NULL; msa->gf = NULL; msa->ngf = 0; msa->alloc_ngf = 0; msa->gs_tag = NULL; msa->gs = NULL; msa->ngs = 0; msa->gc_tag = NULL; msa->gc = NULL; msa->ngc = 0; msa->gr_tag = NULL; msa->gr = NULL; msa->ngr = 0; msa->index = esl_keyhash_Create(); msa->gs_idx = NULL; msa->gc_idx = NULL; msa->gr_idx = NULL; msa->offset = 0; /* Allocation, round 2. */ if(nseq > 0) { ESL_ALLOC(msa->sqname, sizeof(char *) * nseq); ESL_ALLOC(msa->wgt, sizeof(double) * nseq); ESL_ALLOC(msa->sqlen, sizeof(int64_t)* nseq); } /* Initialize at the second level. */ for (i = 0; i < nseq; i++) { msa->sqname[i] = NULL; msa->sqlen[i] = 0; msa->wgt[i] = -1.0; /* "unset so far" */ } return msa; ERROR: esl_msa_Destroy(msa); return NULL; } /*------------------- end, ESL_MSA object -----------------------*/ /***************************************************************** *# 2. Digital mode MSA's *****************************************************************/ /* Function: esl_msa_GuessAlphabet() * Synopsis: Guess alphabet of MSA. * * Purpose: Guess whether the sequences in the are * , , or , and return * that guess in <*ret_type>. * * The determination is made based on the classifications * of the individual sequences in the alignment. At least * one sequence must contain ten residues or more to be * classified. If one or more sequences is called * and one or more is called /, * the alignment's alphabet is considered to be * indeterminate (). If some sequences are * and some are , the alignment is called * ; this should cause no problems, because Easel * reads U as a synonym for T in DNA sequence anyway. * * Tested on Pfam 21.0 and Rfam 7.0, this routine correctly * classified all 8957 Pfam alignments as protein, and 503 * Rfam alignments as RNA (both seed and full alignments). * * Returns: on success, and <*ret_type> is set * to , , or . * * Returns and sets <*ret_type> to * if the alphabet cannot be reliably guessed. * * Xref: J1/62 */ int esl_msa_GuessAlphabet(const ESL_MSA *msa, int *ret_type) { int64_t namino = 0, ndna = 0, nrna = 0, nunknown = 0; int type; int i,x; int64_t j,n; int64_t ct[26]; if (msa->flags & eslMSA_DIGITAL) { *ret_type = msa->abc->type; return eslOK; } *ret_type = eslUNKNOWN; /* On wide alignments, we're better off looking at individual sequence * classifications. We don't want to end up calling the whole alignment * indeterminate just because a few sequences have degenerate residue * codes. */ for (i = 0; i < msa->nseq; i++) { for (x = 0; x < 26; x++) ct[x] = 0; for (n = 0, j = 0; j < msa->alen; j++) { x = toupper(msa->aseq[i][j]) - 'A'; if (x < 0 || x > 25) continue; ct[x]++; n++; if (n > 10000) break; /* ought to know by now */ } esl_abc_GuessAlphabet(ct, &type); switch (type) { case eslAMINO: namino++; break; case eslDNA: ndna++; break; case eslRNA: nrna++; break; default: nunknown++; } } if (namino > 0 && (ndna+nrna) == 0) *ret_type = eslAMINO; else if (ndna > 0 && (nrna+namino) == 0) *ret_type = eslDNA; else if (nrna > 0 && (ndna+namino) == 0) *ret_type = eslRNA; else if (ndna+nrna > 0 && namino == 0) *ret_type = eslDNA; /* On narrow alignments, no single sequence may be long enough to * be classified, but we can determine alphabet from composition * of the complete alignment. Of course, degenerate residue codes in * a DNA alignment will still screw us. */ if (*ret_type == eslUNKNOWN) { n = 0; for (x = 0; x < 26; x++) ct[x] = 0; for (i = 0; i < msa->nseq; i++) { for (j = 0; j < msa->alen; j++) { x = toupper(msa->aseq[i][j]) - 'A'; if (x < 0 || x > 26) continue; ct[x]++; n++; if (n > 10000) break; /* ought to know by now */ } if (n > 10000) break; } esl_abc_GuessAlphabet(ct, ret_type); } if (*ret_type == eslUNKNOWN) return eslENOALPHABET; else return eslOK; } /* Function: esl_msa_CreateDigital() * Synopsis: Create a digital . * * Purpose: Same as , except the returned MSA is configured * for a digital alignment using internal alphabet , instead of * a text alignment. * * Internally, this means the field is allocated instead of * the field, and the flag is raised. * * Args: - number of sequences, or nseq allocation blocksize * - length of alignment in columns, or -1 * * Returns: pointer to new MSA object, w/ all values initialized. * Note that nseq> is initialized to 0, even though space * is allocated. * * Throws: NULL on allocation failure. * * Xref: squid's MSAAlloc() */ ESL_MSA * esl_msa_CreateDigital(const ESL_ALPHABET *abc, int nseq, int64_t alen) { int status; ESL_MSA *msa; int i; msa = msa_create_mostly(nseq, alen); /* aseq is null upon successful return */ if (msa == NULL) return NULL; /* already threw error in mostly_create, so percolate */ ESL_ALLOC(msa->ax, sizeof(ESL_DSQ *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->ax[i] = NULL; if (alen != -1) { for (i = 0; i < nseq; i++) { ESL_ALLOC(msa->ax[i], sizeof(ESL_DSQ) * (alen+2)); msa->ax[i][0] = msa->ax[i][alen+1] = eslDSQ_SENTINEL; /* help the poor */ } msa->nseq = nseq; } msa->abc = (ESL_ALPHABET *) abc; /* this cast away from const-ness is deliberate & safe. */ msa->flags |= eslMSA_DIGITAL; return msa; ERROR: esl_msa_Destroy(msa); return NULL; } /* Function: esl_msa_Digitize() * Synopsis: Digitizes an msa, converting it from text mode. * * Purpose: Given an alignment in text mode, convert it to * digital mode, using alphabet . * * Internally, the digital alignment field is filled, * the text alignment field is destroyed and free'd, * a copy of the alphabet pointer is kept in the msa's * reference, and the flag is raised * in . * * Because may be called on * unvalidated user data, may be passed, for * capturing an informative error message. For example, in * reading alignments from files, invalid characters in the * alignment are caught at the digitization step. * * Args: abc - digital alphabet * msa - multiple alignment to digitize * errbuf - optional: error message buffer, or * * Returns: on success; * if one or more sequences contain invalid characters * that can't be digitized. If this happens, the is returned * unaltered - left in text mode, with as it was. (This is * a normal error, because aseq> may be user input that we * haven't validated yet.) * * Throws: on allocation failure; in this case, state of may be * wedged, and it should only be destroyed, not used. */ int esl_msa_Digitize(const ESL_ALPHABET *abc, ESL_MSA *msa, char *errbuf) { char errbuf2[eslERRBUFSIZE]; int i; int status; /* Contract checks */ if (msa->aseq == NULL) ESL_EXCEPTION(eslEINVAL, "msa has no text alignment"); if (msa->ax != NULL) ESL_EXCEPTION(eslEINVAL, "msa already has digital alignment"); if (msa->flags & eslMSA_DIGITAL) ESL_EXCEPTION(eslEINVAL, "msa is flagged as digital"); /* Validate before we convert. Then we can leave the untouched if * any of the sequences contain invalid characters. */ for (i = 0; i < msa->nseq; i++) if (esl_abc_ValidateSeq(abc, msa->aseq[i], msa->alen, errbuf2) != eslOK) ESL_FAIL(eslEINVAL, errbuf, "%s: %s", msa->sqname[i], errbuf2); /* Convert, sequence-by-sequence, free'ing aseq as we go. */ ESL_ALLOC(msa->ax, msa->sqalloc * sizeof(ESL_DSQ *)); for (i = 0; i < msa->nseq; i++) { ESL_ALLOC(msa->ax[i], (msa->alen+2) * sizeof(ESL_DSQ)); status = esl_abc_Digitize(abc, msa->aseq[i], msa->ax[i]); if (status != eslOK) goto ERROR; free(msa->aseq[i]); } for (; i < msa->sqalloc; i++) msa->ax[i] = NULL; free(msa->aseq); msa->aseq = NULL; msa->abc = (ESL_ALPHABET *) abc; /* convince compiler that removing const-ness is safe */ msa->flags |= eslMSA_DIGITAL; return eslOK; ERROR: return status; } /* Function: esl_msa_Textize() * Synopsis: Convert a digital msa to text mode. * * Purpose: Given an alignment in digital mode, convert it * to text mode. * * Internally, the text alignment field is filled, the * digital alignment field is destroyed and free'd, the * msa's digital alphabet reference is nullified, and * the flag is dropped in . * * Args: msa - multiple alignment to convert to text * * Returns: on success. * * Throws: on allocation failure. * if one or more of the digitized alignment strings * contain invalid characters. */ int esl_msa_Textize(ESL_MSA *msa) { int status; int i; /* Contract checks */ if (msa->ax == NULL) ESL_EXCEPTION(eslEINVAL, "msa has no digital alignment"); if (msa->aseq != NULL) ESL_EXCEPTION(eslEINVAL, "msa already has text alignment"); if (! (msa->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, "msa is not flagged as digital"); if (msa->abc == NULL) ESL_EXCEPTION(eslEINVAL, "msa has no digital alphabet"); /* Convert, sequence-by-sequence, free'ing ax as we go. */ ESL_ALLOC(msa->aseq, msa->sqalloc * sizeof(char *)); for (i = 0; i < msa->nseq; i++) { ESL_ALLOC(msa->aseq[i], (msa->alen+1) * sizeof(char)); status = esl_abc_Textize(msa->abc, msa->ax[i], msa->alen, msa->aseq[i]); if (status != eslOK) goto ERROR; free(msa->ax[i]); } for (; i < msa->sqalloc; i++) msa->aseq[i] = NULL; free(msa->ax); msa->ax = NULL; msa->abc = NULL; /* nullify reference (caller still owns real abc) */ msa->flags &= ~eslMSA_DIGITAL; /* drop the flag */ return eslOK; ERROR: return status; } /* Function: esl_msa_ConvertDegen2X() * Synopsis: Convert all degenerate residues to X/N * * Purpose: Convert all the degenerate residue codes in digital * MSA to the code for "unknown residue" (maximum * degeneracy); for example, X for protein, N for * nucleic acid. * * This is handy when you need to be compatible with * software that can't deal with unusual residue codes. * For example, WU-BLAST can't deal with O (pyrrolysine) * codes. * * Returns: on success. * * Throws: if isn't in digital mode. * (We only know how to interpret the alphabet in digital * mode. In text mode, letters are just letters.) */ int esl_msa_ConvertDegen2X(ESL_MSA *msa) { int i; int status; if (! (msa->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, "esl_msa_ConvertDegen2X only works on digital sequences"); for (i = 0; i < msa->nseq; i++) if ((status = esl_abc_ConvertDegen2X(msa->abc, msa->ax[i])) != eslOK) return status; return eslOK; } /*---------------------- end of digital MSA functions -----------------------*/ /***************************************************************** *# 3. Setting, checking data fields in an ESL_MSA *****************************************************************/ /* These get used by parsers, which might be using an ESL_BUFFER. * They need to handle either NUL-terminated strings or memory lines. */ /* Function: esl_msa_SetName() * Synopsis: Set name of an MSA. * * Purpose: Sets the name of the msa to string , * of length . * * If is a NUL-terminated string, is optional; if * the length is unknown, pass . may also be a * memory line, non-NUL terminated, in which case is * required. * * can also be because the MSA name is an * optional field. (In this case, is irrelevant and * ignored.) * * Returns: on success. * * Throws: on allocation error. */ int esl_msa_SetName(ESL_MSA *msa, const char *s, esl_pos_t n) { if (msa->name) free(msa->name); if (n > 0) return esl_memstrdup(s, n, &(msa->name)); else return esl_strdup( s, -1, &(msa->name)); } /* Function: esl_msa_SetDesc() * Synopsis: Set the description line of an MSA. * * Purpose: Sets the optional description line of the msa to * string of length . * * If is a NUL-terminated string, is optional; if * the length is unknown, pass . may also be a * memory line, non-NUL terminated, in which case is * required. * * can also be because the MSA description is an * optional field. (In this case, is irrelevant and * ignored.) * * Returns: on success. * * Throws: on allocation error. */ int esl_msa_SetDesc(ESL_MSA *msa, const char *s, esl_pos_t n) { if (msa->desc) free(msa->desc); if (n > 0) return esl_memstrdup(s, n, &(msa->desc)); else return esl_strdup( s, -1, &(msa->desc)); } /* Function: esl_msa_SetAccession() * Synopsis: Set the accession field of an MSA. * * Purpose: Sets accession field of the msa to string of * length . * * If is a NUL-terminated string, is optional; if * the length is unknown, pass . may also be a * memory line, non-NUL terminated, in which case is * required. * * can also be because the MSA accession is an * optional field. (In this case, is irrelevant and * ignored.) * * Returns: on success. * * Throws: on allocation error. */ int esl_msa_SetAccession(ESL_MSA *msa, const char *s, esl_pos_t n) { if (msa->acc) free(msa->acc); if (n > 0) return esl_memstrdup(s, n, &(msa->acc)); else return esl_strdup( s, -1, &(msa->acc)); } /* Function: esl_msa_SetAuthor() * Synopsis: Set the author string in an MSA. * * Purpose: Sets the author string in to string of * length . * * If is a NUL-terminated string, is optional; if * the length is unknown, pass . may also be a * memory line, non-NUL terminated, in which case is * required. * * can also be because the MSA author is an * optional field. (In this case, is irrelevant and * ignored.) * * Returns: on success. * * Throws: on allocation error. */ int esl_msa_SetAuthor(ESL_MSA *msa, const char *s, esl_pos_t n) { if (msa->au) free(msa->au); if (n > 0) return esl_memstrdup(s, n, &(msa->au)); else return esl_strdup( s, -1, &(msa->au)); } /* Function: esl_msa_SetSeqName() * Synopsis: Set an individual sequence name in an MSA. * * Purpose: Set the name of sequence number in * to string of length . * * If is a NUL-terminated string, is optional; if * the length is unknown, pass . may also be a * memory line, non-NUL terminated, in which case is * required. * * Returns: on success. * * Throws: on allocation error. * on coding errors. * * Note: msa->sqname[] is not optional, so we may * rely on it already being allocated for * i=0..sqalloc-1. */ int esl_msa_SetSeqName(ESL_MSA *msa, int idx, const char *s, esl_pos_t n) { if (idx >= msa->sqalloc) ESL_EXCEPTION(eslEINCONCEIVABLE, "no such sequence %d (only %d allocated)", idx, msa->sqalloc); if (s == NULL) ESL_EXCEPTION(eslEINCONCEIVABLE, "seq names are mandatory; NULL is not a valid name"); if (msa->sqname[idx]) free(msa->sqname[idx]); if (n > 0) return esl_memstrdup(s, n, &(msa->sqname[idx])); else return esl_strdup( s, -1, &(msa->sqname[idx])); } /* Function: esl_msa_SetSeqAccession() * Synopsis: Sets individual sequence accession in an MSA. * * Purpose: Set the accession of sequence number in to * string of length . * * If is a NUL-terminated string, is optional; if * the length is unknown, pass . may also be a * memory line, non-NUL terminated, in which case is * required. * * can also be because a seq accession is an * optional field. (In this case, is irrelevant and * ignored.) * * Returns: on success. * * Throws: on allocation error. * on coding errors. */ int esl_msa_SetSeqAccession(ESL_MSA *msa, int idx, const char *s, esl_pos_t n) { int i; int status; if (idx >= msa->sqalloc) ESL_EXCEPTION(eslEINCONCEIVABLE, "no such sequence %d (only %d allocated)", idx, msa->sqalloc); if (msa->sqacc && msa->sqacc[idx]) { free(msa->sqacc[idx]); msa->sqacc[idx] = NULL; } /* erasure case */ if (! s && msa->sqacc) { for (i = 0; i < msa->sqalloc; i++) if (msa->sqacc[i]) break; if (i == msa->sqalloc) { free(msa->sqacc); msa->sqacc = NULL; } return eslOK; } /* Allocate/initialize the optional sqacc array, if it's not already done: */ if (! msa->sqacc) { ESL_ALLOC(msa->sqacc, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->sqacc[i] = NULL; } if (n > 0) status = esl_memstrdup(s, n, &(msa->sqacc[idx])); else status = esl_strdup( s, -1, &(msa->sqacc[idx])); return status; ERROR: return status; } /* Function: esl_msa_SetSeqDescription() * Synopsis: Sets individual sequence description in an MSA. * * Purpose: Set the description of sequence number in to * string of length . * * If is a NUL-terminated string, is optional; if * the length is unknown, pass . may also be a * memory line, non-NUL terminated, in which case is * required. * * can also be because a seq accession is an * optional field. (In this case, is irrelevant and * ignored.) * * Returns: on success. * * Throws: on allocation error. * on coding error */ int esl_msa_SetSeqDescription(ESL_MSA *msa, int idx, const char *s, esl_pos_t n) { int i; int status; if (idx >= msa->sqalloc) ESL_EXCEPTION(eslEINCONCEIVABLE, "no such sequence %d (only %d allocated)", idx, msa->sqalloc); if (msa->sqdesc && msa->sqdesc[idx]) { free(msa->sqdesc[idx]); msa->sqdesc[idx] = NULL; } /* erasure case. If we just freed the only description, free the entire optional array */ if (! s && msa->sqdesc) { for (i = 0; i < msa->sqalloc; i++) if (msa->sqdesc[i]) break; if (i == msa->sqalloc) { free(msa->sqdesc); msa->sqdesc = NULL; } return eslOK; } /* Allocate/initialize the optional sqdesc array, if it's not already done: */ if (msa->sqdesc == NULL) { ESL_ALLOC(msa->sqdesc, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->sqdesc[i] = NULL; } if (n > 0) status = esl_memstrdup(s, n, &(msa->sqdesc[idx])); else status = esl_strdup( s, -1, &(msa->sqdesc[idx])); ERROR: return status; } /* Function: esl_msa_SetDefaultWeights() * Synopsis: Set all sequence weights to default 1.0. * * Purpose: Set all the sequence weights in to default, * 1.0. Drop the flag in flags>. * * The data structure has its values * initialized to -1.0, by create and expand functions, as * a special value for "unset yet". File format parsers use * this to tell when a weight is mistakenly set twice, or * not at all. However, when an is used, you're * allowed to assume that is valid even if the * flag is down. So all creators of new * MSAs (file format parsers, for example) must assure that * wgt> is set correctly, even if the file format * doesn't include weights. This function gives parsers * (and other MSA creators) a quick way to do this. */ int esl_msa_SetDefaultWeights(ESL_MSA *msa) { int idx; for (idx = 0; idx < msa->nseq; idx++) msa->wgt[idx] = 1.0; msa->flags &= ~eslMSA_HASWGTS; return eslOK; } /* Function: esl_msa_FormatName() * Synopsis: Format name of an MSA, printf()-style. * * Purpose: Sets the name of the msa using , where * is a -style format with * arguments; for example, . * * can be , because the MSA name is an * optional field; in which case any existing name in * the is erased. * * Returns: on success. * * Throws: on allocation error; * if a <*printf()> library call fails. */ int esl_msa_FormatName(ESL_MSA *msa, const char *name, ...) { va_list ap; int status; if (msa->name != NULL) free(msa->name); if (name == NULL) { msa->name = NULL; return eslOK; } va_start(ap, name); status = esl_vsprintf(&(msa->name), name, &ap); va_end(ap); return status; } /* Function: esl_msa_FormatDesc() * Synopsis: Format the description line of an MSA, printf()-style. * * Purpose: Format the description line of the msa using . * where is a -style format with * arguments. * For example, . * * As a special case, may be , to facilitate * handling of optional annotation. * * Returns: on success. * * Throws: on allocation error; * if a <*printf()> library call fails. */ int esl_msa_FormatDesc(ESL_MSA *msa, const char *desc, ...) { va_list ap; int status; if (msa->desc != NULL) free(msa->desc); va_start(ap, desc); status = esl_vsprintf(&(msa->desc), desc, &ap); va_end(ap); return status; } /* Function: esl_msa_FormatAccession() * Synopsis: Format the accession number of an MSA, printf()-style. * * Purpose: Sets accession number of the msa using , * where is a -style format with arguments. * For example, . * * As a special case, may be , to facilitate * handling of optional annotation. * * Returns: on success. * * Throws: on allocation error; * if a <*printf()> library call fails. */ int esl_msa_FormatAccession(ESL_MSA *msa, const char *acc, ...) { va_list ap; int status; if (msa->acc != NULL) free(msa->acc); va_start(ap, acc); status = esl_vsprintf(&(msa->acc), acc, &ap); va_end(ap); return status; } /* Function: esl_msa_FormatAuthor() * Synopsis: Format the author string in an MSA, printf()-style. * * Purpose: Sets the author string in , using an string * and arguments in same format as would take. * * As a special case, may be , to facilitate * handling of optional annotation. * * Returns: on success. * * Throws: on allocation error; * if a <*printf()> library call fails. */ int esl_msa_FormatAuthor(ESL_MSA *msa, const char *author, ...) { va_list ap; int status; if (msa->au != NULL) free(msa->au); va_start(ap, author); status = esl_vsprintf(&(msa->au), author, &ap); va_end(ap); return status; } /* Function: esl_msa_FormatSeqName() * Synopsis: Formats an individual sequence name in an MSA, printf()-style. * * Purpose: Set the name of sequence number in * to , where is a * style format and arguments. * * Returns: on success. * * Throws: if is ; * on allocation error; * if a <*printf()> library call fails. * * Note: msa->sqname[] is not optional, so we may * rely on it already being allocated for * i=0..sqalloc-1. */ int esl_msa_FormatSeqName(ESL_MSA *msa, int idx, const char *name, ...) { va_list ap; int status; if (idx >= msa->sqalloc) ESL_EXCEPTION(eslEINVAL, "no such sequence %d (only %d allocated)", idx, msa->sqalloc); if (name == NULL) ESL_EXCEPTION(eslEINVAL, "seq names are mandatory; NULL is not a valid name"); if (msa->sqname[idx] != NULL) free(msa->sqname[idx]); va_start(ap, name); status = esl_vsprintf(&(msa->sqname[idx]), name, &ap); va_end(ap); return status; } /* Function: esl_msa_FormatSeqAccession() * Synopsis: Format individual sequence accession in an MSA, printf()-style. * * Purpose: Set the accession of sequence number in to * , where is a style format and * arguments. * * Returns: on success. * * Throws: on allocation error; * if a <*printf()> library call fails. */ int esl_msa_FormatSeqAccession(ESL_MSA *msa, int idx, const char *acc, ...) { va_list ap; int i; int status; if (idx >= msa->sqalloc) ESL_EXCEPTION(eslEINVAL, "no such sequence %d (only %d allocated)", idx, msa->sqalloc); if (acc == NULL) { if (msa->sqacc != NULL) { free(msa->sqacc[idx]); msa->sqacc[idx] = NULL; } return eslOK; } /* Allocate/initialize the optional sqacc array, if it's not already done: */ if (msa->sqacc == NULL) { ESL_ALLOC(msa->sqacc, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->sqacc[i] = NULL; } if (msa->sqacc[idx] != NULL) free(msa->sqacc[idx]); va_start(ap, acc); status = esl_vsprintf(&(msa->sqacc[idx]), acc, &ap); va_end(ap); return status; ERROR: return status; } /* Function: esl_msa_FormatSeqDescription() * Synopsis: Formats individual sequence description in an MSA, printf()-style. * * Purpose: Set the description of sequence number in to * , where may be a style format and * arguments. * * Returns: on success. * * Throws: on allocation error; * if a <*printf()> library call fails. */ int esl_msa_FormatSeqDescription(ESL_MSA *msa, int idx, const char *desc, ...) { va_list ap; int i; int status; if (idx >= msa->sqalloc) ESL_EXCEPTION(eslEINVAL, "no such sequence %d (only %d allocated)", idx, msa->sqalloc); if (desc == NULL) { if (msa->sqdesc != NULL) { free(msa->sqdesc[idx]); msa->sqdesc[idx] = NULL; } return eslOK; } /* Allocate/initialize the optional sqdesc array, if it's not already done: */ if (msa->sqdesc == NULL) { ESL_ALLOC(msa->sqdesc, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->sqdesc[i] = NULL; } if (msa->sqdesc[idx] != NULL) free(msa->sqdesc[idx]); va_start(ap, desc); status = esl_vsprintf(&(msa->sqdesc[idx]), desc, &ap); va_end(ap); return status; ERROR: return status; } /* Function: esl_msa_AddComment() * Synopsis: Add an unparsed command to an * * Purpose: Add an (unparsed) comment line to the MSA structure, * allocating as necessary. * * Args: msa - a multiple alignment * p - comment line to add * n - length of

, or -1 if

is a NUL-terminated string and length is unknown. * * Returns: on success. * * Throws: on allocation failure. */ int esl_msa_AddComment(ESL_MSA *msa, char *p, esl_pos_t n) { int status; if (n == -1) n = strlen(p); /* If this is our first recorded comment, we need to allocate; * and if we've filled available space, we need to reallocate. */ if (msa->comment == NULL) { ESL_ALLOC(msa->comment, sizeof(char *) * 16); msa->alloc_ncomment = 16; } if (msa->ncomment == msa->alloc_ncomment) { ESL_REALLOC(msa->comment, sizeof(char *) * msa->alloc_ncomment * 2); msa->alloc_ncomment *= 2; } if ((status = esl_memstrdup(p, n, &(msa->comment[msa->ncomment]))) != eslOK) goto ERROR; msa->ncomment++; return eslOK; ERROR: return status; } /* Function: esl_msa_AddGF() * Synopsis: Add an unparsed #=GF markup line to an * * Purpose: Add an unparsed \verb+#=GF+ markup line to the MSA, * allocating as necessary. is the GF markup * tag; is the text associated w/ that tag. * * Args: msa - a multiple alignment * tag - markup tag * taglen - length of ; or -1 if is a string of unknown length * value - markup text * vlen - length of ; or -1 if is a string of unknown length * * Returns: on success. * * Throws: on allocation failure. */ int esl_msa_AddGF(ESL_MSA *msa, char *tag, esl_pos_t taglen, char *value, esl_pos_t vlen) { int n; int status; if (taglen == -1) taglen = strlen(tag); if (vlen == -1) vlen = strlen(value); /* Initialize or grow the allocation? */ if (msa->ngf == msa->alloc_ngf) { n = (msa->alloc_ngf == 0 ? 16 : msa->alloc_ngf * 2); ESL_REALLOC(msa->gf_tag, sizeof(char *) * n); ESL_REALLOC(msa->gf, sizeof(char *) * n); msa->alloc_ngf = n; } if ((status = esl_memstrdup(tag, taglen, &(msa->gf_tag[msa->ngf]))) != eslOK) goto ERROR; if ((status = esl_memstrdup(value, vlen, &(msa->gf[msa->ngf]))) != eslOK) goto ERROR; msa->ngf++; return eslOK; ERROR: return status; } /* Function: esl_msa_AddGS() * Synopsis: Add an unparsed #=GS markup line to an * * Purpose: Add an unparsed \verb+#=GS+ markup line to the MSA, * allocating as necessary. It's possible that we * could get more than one of the same type of GS * tag per sequence; for example, "DR PDB;" structure * links in Pfam. Hack: handle these by appending to * the string, in a \verb+\n+ separated fashion. * * Args: msa - multiple alignment structure * tag - markup tag (e.g. "AC") * taglen - length of ; or -1 if is a string of unknown length * sqidx - index of sequence to assoc markup with (0..nseq-1) * value - markup (e.g. "P00666") * vlen - length of ; or -1 if is string of unknown length * * Returns: on success. * * Throws: on allocation failure. */ int esl_msa_AddGS(ESL_MSA *msa, char *tag, esl_pos_t taglen, int sqidx, char *value, esl_pos_t vlen) { int tagidx; int i; int status; if (taglen == -1) taglen = strlen(tag); if (vlen == -1) vlen = strlen(value); /* first GS tag? init&allocate */ if (msa->gs_tag == NULL) { msa->gs_idx = esl_keyhash_Create(); status = esl_keyhash_Store(msa->gs_idx, tag, taglen, &tagidx); if (status != eslOK && status != eslEDUP) return status; ESL_DASSERT1((tagidx == 0)); ESL_ALLOC(msa->gs_tag, sizeof(char *)); /* one at a time. */ ESL_ALLOC(msa->gs, sizeof(char **)); ESL_ALLOC(msa->gs[0], sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->gs[0][i] = NULL; } else { /* Get a tagidx for this GS tag. * tagidx < ngs; we already saw this tag; * tagidx == ngs; this is a new one. */ status = esl_keyhash_Store(msa->gs_idx, tag, taglen, &tagidx); if (status != eslOK && status != eslEDUP) return status; /* Reallocation (in blocks of 1) */ if (tagidx == msa->ngs ) { ESL_REALLOC(msa->gs_tag, (msa->ngs+1) * sizeof(char *)); ESL_REALLOC(msa->gs, (msa->ngs+1) * sizeof(char **)); msa->gs[tagidx] = NULL; ESL_ALLOC(msa->gs[tagidx], sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->gs[tagidx][i] = NULL; } } /* Store the tag, if it's new. */ if (tagidx == msa->ngs) { if ((status = esl_memstrdup(tag, taglen, &(msa->gs_tag[tagidx]))) != eslOK) goto ERROR; msa->ngs++; } /* Store the annotation on the sequence. * If seq is unannotated, dup the value; if * seq already has a GS annotation, cat a \n, then cat the value. */ if (msa->gs[tagidx][sqidx] == NULL) { if ((status = esl_memstrdup(value, vlen, &(msa->gs[tagidx][sqidx]))) != eslOK) goto ERROR; } else { esl_pos_t n1,n2; n1 = strlen(msa->gs[tagidx][sqidx]); n2 = (vlen == -1 ? strlen(value) : vlen); ESL_REALLOC(msa->gs[tagidx][sqidx], sizeof(char) * (n1+n2+2)); /* +2 for \n, \0 */ msa->gs[tagidx][sqidx][n1] = '\n'; memcpy(msa->gs[tagidx][sqidx]+n1+1, value, n2); msa->gs[tagidx][sqidx][n1+n2+1] = '\0'; } return eslOK; ERROR: return status; } /* Function: esl_msa_AppendGC() * Synopsis: Add an unparsed #=GC markup line to an * * Purpose: Add an unparsed \verb+#=GC+ markup line to the MSA * structure, allocating as necessary. When called * multiple times for the same tag, appends value * strings together -- used when parsing multiblock * alignment files, for example. * * Args: msa - multiple alignment structure * tag - markup tag (e.g. "CS") * value - markup, one char per aligned column * * Returns: on success. * * Throws: on allocation failure. */ int esl_msa_AppendGC(ESL_MSA *msa, char *tag, char *value) { int tagidx; int status; void *p; /* Is this an unparsed tag name that we recognize? * If not, handle adding it to index, and reallocating * as needed. */ if (msa->gc_tag == NULL) /* first tag? init&allocate */ { msa->gc_idx = esl_keyhash_Create(); status = esl_keyhash_Store(msa->gc_idx, tag, -1, &tagidx); if (status != eslOK && status != eslEDUP) return status; ESL_DASSERT1((tagidx == 0)); ESL_ALLOC(msa->gc_tag, sizeof(char *)); ESL_ALLOC(msa->gc, sizeof(char *)); msa->gc[0] = NULL; } else { /* new tag? */ /* get tagidx for this GC tag. existing tag: gc_idx, tag, -1, &tagidx); if (status != eslOK && status != eslEDUP) goto ERROR; /* Reallocate, in block of one tag at a time */ if (tagidx == msa->ngc) { ESL_RALLOC(msa->gc_tag, p, (msa->ngc+1) * sizeof(char **)); ESL_RALLOC(msa->gc, p, (msa->ngc+1) * sizeof(char **)); msa->gc[tagidx] = NULL; } } /* new tag? store it. */ if (tagidx == msa->ngc) { if ((status = esl_strdup(tag, -1, &(msa->gc_tag[tagidx]))) != eslOK) goto ERROR; msa->ngc++; } return (esl_strcat(&(msa->gc[tagidx]), -1, value, -1)); ERROR: return status; } /* Function: esl_msa_AppendGR() * Synopsis: Add an unparsed #=GR markup line to an * * Purpose: Add an unparsed \verb+#=GR+ markup line to the MSA structure, * allocating as necessary. * * When called multiple times for the same tag, appends * value strings together -- used when parsing multiblock * alignment files, for example. * * Args: msa - multiple alignment structure * tag - markup tag (e.g. "SS") * sqidx - index of seq to assoc markup with (0..nseq-1) * value - markup, one char per aligned column * * Returns: on success. * * Throws: on allocation failure. */ int esl_msa_AppendGR(ESL_MSA *msa, char *tag, int sqidx, char *value) { void *p; int tagidx; int i; int status; if (msa->gr_tag == NULL) /* first tag? init&allocate */ { msa->gr_idx = esl_keyhash_Create(); status = esl_keyhash_Store(msa->gr_idx, tag, -1, &tagidx); if (status != eslOK && status != eslEDUP) return status; ESL_DASSERT1((tagidx == 0)); ESL_ALLOC(msa->gr_tag, sizeof(char *)); ESL_ALLOC(msa->gr, sizeof(char **)); ESL_ALLOC(msa->gr[0], sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->gr[0][i] = NULL; } else { /* get tagidx for this GR tag. existinggr_idx, tag, -1, &tagidx); if (status != eslOK && status != eslEDUP) return status; /* if a new tag, realloc for it */ if (tagidx == msa->ngr) { ESL_RALLOC(msa->gr_tag, p, (msa->ngr+1) * sizeof(char *)); ESL_RALLOC(msa->gr, p, (msa->ngr+1) * sizeof(char **)); ESL_ALLOC(msa->gr[msa->ngr], sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->gr[msa->ngr][i] = NULL; } } if (tagidx == msa->ngr) { if ((status = esl_strdup(tag, -1, &(msa->gr_tag[tagidx]))) != eslOK) goto ERROR; msa->ngr++; } return (esl_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1)); ERROR: return status; } /* Function: esl_msa_CheckUniqueNames() * Synopsis: Check if all seq names are unique. * * Purpose: Check whether all the sequence names in * are unique; if so, return , and if not, * return . * * Stockholm files require names to be unique. This * function lets us check whether we need to munge seqnames * before writing a Stockholm file. * * The check uses a keyhash, so it's efficient. * * Args: msa - alignment * * Returns: if names are unique. * if not. * * Throws: on allocation failure. */ int esl_msa_CheckUniqueNames(const ESL_MSA *msa) { ESL_KEYHASH *kh = NULL; int idx; int status = TRUE; if ((kh = esl_keyhash_Create()) == NULL) { status = eslEMEM; goto ERROR; } for (idx = 0; idx < msa->nseq; idx++) { status = esl_keyhash_Store(kh, msa->sqname[idx], -1, NULL); if (status == eslEDUP) { status = eslFAIL; break; } else if (status != eslOK) goto ERROR; } esl_keyhash_Destroy(kh); return status; ERROR: if (kh) esl_keyhash_Destroy(kh); return status; } /* msa_set_seq_ss() * * Set the secondary structure annotation for sequence number * in an alignment by copying the string . * * Returns: on success. * * Throws: on allocation failure. */ static int msa_set_seq_ss(ESL_MSA *msa, int seqidx, const char *ss) { int status; int i; if (msa->ss == NULL) { ESL_ALLOC(msa->ss, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->ss[i] = NULL; } if (msa->ss[seqidx] != NULL) free(msa->ss[seqidx]); return (esl_strdup(ss, -1, &(msa->ss[seqidx]))); ERROR: return status; } /* msa_set_seq_sa() * * Set the surface accessibility annotation for sequence number * in an alignment by copying the string . * * Returns: on success. * * Throws: on allocation failure. */ static int msa_set_seq_sa(ESL_MSA *msa, int seqidx, const char *sa) { int status; int i; if (msa->sa == NULL) { ESL_ALLOC(msa->sa, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->sa[i] = NULL; } if (msa->sa[seqidx] != NULL) free(msa->sa[seqidx]); return (esl_strdup(sa, -1, &(msa->sa[seqidx]))); ERROR: return status; } /* msa_set_seq_pp() * * Set the posterior probability annotation for sequence number * in an alignment by copying the string . * * Returns: on success. * * Throws: on allocation failure. */ static int msa_set_seq_pp(ESL_MSA *msa, int seqidx, const char *pp) { int status; int i; if (msa->pp == NULL) { ESL_ALLOC(msa->pp, sizeof(char *) * msa->sqalloc); for (i = 0; i < msa->sqalloc; i++) msa->pp[i] = NULL; } if (msa->pp[seqidx] != NULL) free(msa->pp[seqidx]); return (esl_strdup(pp, -1, &(msa->pp[seqidx]))); ERROR: return status; } /*---------- end of ESL_MSA field setting/checking --------------*/ /***************************************************************** *# 4. Miscellaneous functions for manipulating MSAs *****************************************************************/ static int64_t msa_get_rlen(const ESL_MSA *msa, int seqidx); /* Function: esl_msa_ReasonableRF() * Synopsis: Determine a reasonable #=RF line marking "consensus" columns. * * Purpose: Define an for the multiple alignment that * marks consensus columns with an 'x' (or the consensus * letter if useconsseq is TRUE), and non-consensus columns * with a '.'. * * Consensus columns are defined as columns with fractional * occupancy of $\geq$ in residues. For example, * if is 0.7, columns containing $\geq$ 70\% * residues are assigned as 'x' (or consensus letter) in the * , roughly speaking. "Roughly speaking", because the * fractional occupancy is in fact calculated as a weighted * frequency using sequence weights in wgt>, and because * missing data symbols are ignored in order to be able to * deal with sequence fragments. * * The greater is, the more stringent the * definition, and the fewer columns will be defined as * consensus. will define all columns as * consensus. will only define a column as * consensus if it contains no gap characters at all. * * If the caller wants to designate any sequences as * fragments, it must convert all leading and trailing gaps * to the missing data symbol '~'. * * For text mode alignments, any alphanumeric character is * considered to be a residue, and any non-alphanumeric * character is considered to be a gap. * * The is a NUL-terminated string, indexed * <0..alen-1>. * * The result can be rf>, if the caller * wants to set the own RF line; or it can be any * alternative storage provided by the caller. In either * case, the caller must provide allocated space for at * least alen+1> chars. * * Args: msa - MSA to define a consensus RF line for * symfrac - threshold for defining consensus columns * useconsseq - if FALSE, use x for a consensus position; else use the consensus letter * rfline - RESULT: string containing a letter for consensus position, . for not * * Returns: on success. * * Xref: HMMER p7_Fastmodelmaker() uses an essentially identical * calculation to define model architecture, and could be * rewritten now to use this function. * * A2M format alignment output uses this to define * consensus columns when #=RF annotation isn't available. */ int esl_msa_ReasonableRF(ESL_MSA *msa, double symfrac, int useconsseq, char *rfline) { int apos; int idx; double r; double totwgt; float *counts = NULL; int status; if (useconsseq) ESL_ALLOC(counts, msa->abc->K * sizeof(float)); if (msa->flags & eslMSA_DIGITAL) { for (apos = 1; apos <= msa->alen; apos++) { r = totwgt = 0.; esl_vec_FSet(counts, msa->abc->K, 0.0); for (idx = 0; idx < msa->nseq; idx++) { if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) { r += msa->wgt[idx]; totwgt += msa->wgt[idx]; if (useconsseq) esl_abc_FCount(msa->abc, counts, msa->ax[idx][apos], msa->wgt[idx]); } else if (esl_abc_XIsGap(msa->abc, msa->ax[idx][apos])) totwgt += msa->wgt[idx]; else if (esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) continue; } if (r > 0. && r / totwgt >= symfrac) { if (useconsseq) rfline[apos-1] = msa->abc->sym[esl_vec_FArgMax(counts, msa->abc->K)]; else rfline[apos-1] = 'x'; } else rfline[apos-1] = '.'; } } if (! (msa->flags & eslMSA_DIGITAL)) { for (apos = 0; apos < msa->alen; apos++) { r = totwgt = 0.; for (idx = 0; idx < msa->nseq; idx++) { if (isalpha(msa->aseq[idx][apos])) { r += msa->wgt[idx]; totwgt += msa->wgt[idx]; if (useconsseq) esl_abc_FCount(msa->abc, counts, msa->abc->inmap[ (int) msa->aseq[idx][apos] ], msa->wgt[idx]); } else totwgt += msa->wgt[idx]; } if (r > 0. && r / totwgt >= symfrac) { if (useconsseq) rfline[apos-1] = msa->abc->sym[esl_vec_FArgMax(counts, msa->abc->K)]; else rfline[apos] = 'x'; } else rfline[apos] = '.'; } } rfline[msa->alen] = '\0'; if (counts) free(counts); return eslOK; ERROR: if (counts) free(counts); return status; } /* Function: esl_msa_MarkFragments() * Synopsis: Heuristic definition of sequence fragments in an alignment * Incept: SRE, Sun 14 Apr 2019 [DL4378 LHR-BOS] * * Purpose: Heuristically define sequence fragments (as opposed to * full length sequences) in . Set bit in * to if seq is a fragment, else . * * The rule is to calculate the fractional "span" of each * aligned sequence: the aligned length from its first to * last residue, divided by the total number of aligned * columns; sequence is defined as a fragment if . * * In a text mode , any alphanumeric character is * considered to be a residue, and any non-alphanumeric * char is considered to be a gap. * * is allocated here; caller frees. * * Returns: on success. * * Throws: on allocation error. * * Xref: Compare , which uses the same fragment definition * rule without calling this function, because it's used a little * differently. */ int esl_msa_MarkFragments(const ESL_MSA *msa, float fragthresh, ESL_BITFIELD **ret_fragassign) { ESL_BITFIELD *fragassign = NULL; int minspan = (int) ceil(fragthresh * (float) msa->alen); // precalculated span length threshold using int idx,lpos,rpos; int status; if (( fragassign = esl_bitfield_Create(msa->nseq)) == NULL) { status = eslEMEM; goto ERROR; } /* Digital mode */ if (msa->flags & eslMSA_DIGITAL) { for (idx = 0; idx < msa->nseq; idx++) { for (lpos = 1; lpos <= msa->alen; lpos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][lpos])) break; for (rpos = msa->alen; rpos >= 1; rpos--) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][rpos])) break; /* L=0 sequence? lpos == msa->alen+1, rpos == 0; lpos > rpos. * alen=0 alignment? lpos == 1, rpos == 0; lpos > rpos. * so alispan = rpos-lpos+1 could be <= 0 if lpos > rpos */ if (rpos-lpos+1 < minspan) esl_bitfield_Set(fragassign, idx); } } /* Text mode */ else { for (idx = 0; idx < msa->nseq; idx++) { for (lpos = 0; lpos < msa->alen; lpos++) if (isalpha(msa->aseq[idx][lpos])) break; for (rpos = msa->alen-1; rpos >= 0; rpos--) if (isalpha(msa->aseq[idx][rpos])) break; if (rpos-lpos+1 < minspan) esl_bitfield_Set(fragassign, idx); } } *ret_fragassign = fragassign; return eslOK; ERROR: esl_bitfield_Destroy(fragassign); *ret_fragassign = NULL; return status; } /* Function: esl_msa_MarkFragments_old() * Synopsis: Heuristically define seq fragments in an alignment. * * Purpose: Use a heuristic to define sequence fragments (as opposed * to "full length" sequences) in alignment . * * The rule is that if the sequence has a raw (unaligned) * length not greater than times the alignment * length in columns, the sequence is defined as a fragment. * * For each fragment, all leading and trailing gap symbols * (all gaps before the first residue and after the last * residue) are converted to missing data symbols * (typically '~', but nonstandard digital alphabets may * have defined another character). * * If is 0.0, no nonempty sequence is defined * as a fragment. * * If is 1.0, all sequences are defined as * fragments. * * Args: msa - alignment in which to define and mark seq fragments * fragthresh - define frags if rlen <= fragthresh * alen. * * Returns: on success. */ int esl_msa_MarkFragments_old(ESL_MSA *msa, double fragthresh) { int i; int pos; for (i = 0; i < msa->nseq; i++) if (msa_get_rlen(msa, i) <= fragthresh * msa->alen) { if (msa->flags & eslMSA_DIGITAL) { for (pos = 1; pos <= msa->alen; pos++) { if (esl_abc_XIsResidue(msa->abc, msa->ax[i][pos])) break; msa->ax[i][pos] = esl_abc_XGetMissing(msa->abc); } for (pos = msa->alen; pos >= 1; pos--) { if (esl_abc_XIsResidue(msa->abc, msa->ax[i][pos])) break; msa->ax[i][pos] = esl_abc_XGetMissing(msa->abc); } } if (! (msa->flags & eslMSA_DIGITAL)) { for (pos = 0; pos < msa->alen; pos++) { if (isalnum(msa->aseq[i][pos])) break; msa->aseq[i][pos] = '~'; } for (pos = msa->alen-1; pos >= 0; pos--) { if (isalnum(msa->aseq[i][pos])) break; msa->aseq[i][pos] = '~'; } } } return eslOK; } /* Function: esl_msa_SequenceSubset() * Synopsis: Select subset of sequences into a smaller MSA. * * Purpose: Given an array (0..nseq-1) of TRUE/FALSE flags for each * sequence in an alignment ; create a new alignment containing * only those seqs which are flagged . Return a pointer * to this newly allocated alignment through . Caller is * responsible for freeing it. * * The smaller alignment might now contain columns * consisting entirely of gaps or missing data, depending * on what sequence subset was extracted. The caller may * want to immediately call on the * new alignment to clean this up. * * Unparsed GS and GR Stockholm annotation that is presumably still * valid is transferred to the new alignment. Unparsed GC, GF, and * comments that are potentially invalidated by taking the subset * of sequences are not transferred to the new MSA. * * Weights are transferred exactly. If they need to be * renormalized to some new total weight (such as the new, * smaller total sequence number), the caller must do that. * * may be in text mode or digital mode. The new MSA * in will have the same mode. * * Returns: on success, and is set to point at a new * (smaller) alignment. * * Throws: if the subset has no sequences in it; * on allocation error. * * Xref: squid's MSASmallerAlignment(), 1999. */ int esl_msa_SequenceSubset(const ESL_MSA *msa, const int *useme, ESL_MSA **ret_new) { ESL_MSA *new_msa = NULL; int nnew; /* number of seqs in the new MSA */ int oidx, nidx; /* old, new indices */ int i; int status; ESL_DASSERT1(( msa->nseq > 0 )); ESL_DASSERT1(( msa->alen >= 0 )); // This silences static checkers that think msa->alen might be -1. *ret_new = NULL; nnew = 0; for (oidx = 0; oidx < msa->nseq; oidx++) if (useme[oidx]) nnew++; if (nnew == 0) ESL_EXCEPTION(eslEINVAL, "No sequences selected"); /* Note that the Create() calls allocate exact space for the sequences, * so we will strcpy()/memcpy() into them below. */ if ((msa->flags & eslMSA_DIGITAL) && (new_msa = esl_msa_CreateDigital(msa->abc, nnew, msa->alen)) == NULL) {status = eslEMEM; goto ERROR; } if (! (msa->flags & eslMSA_DIGITAL) && (new_msa = esl_msa_Create(nnew, msa->alen)) == NULL) {status = eslEMEM; goto ERROR; } if (new_msa == NULL) {status = eslEMEM; goto ERROR; } /* Copy the old to the new */ for (nidx = 0, oidx = 0; oidx < msa->nseq; oidx++) if (useme[oidx]) { if (msa->flags & eslMSA_DIGITAL) memcpy(new_msa->ax[nidx], msa->ax[oidx], sizeof(ESL_DSQ) * (msa->alen+2)); if (! (msa->flags & eslMSA_DIGITAL)) strcpy(new_msa->aseq[nidx], msa->aseq[oidx]); if ((status = esl_strdup(msa->sqname[oidx], -1, &(new_msa->sqname[nidx]))) != eslOK) goto ERROR; new_msa->wgt[nidx] = msa->wgt[oidx]; if (msa->sqacc && msa->sqacc[oidx] && (status = esl_msa_SetSeqAccession (new_msa, nidx, msa->sqacc[oidx], -1)) != eslOK) goto ERROR; if (msa->sqdesc && msa->sqdesc[oidx] && (status = esl_msa_SetSeqDescription(new_msa, nidx, msa->sqdesc[oidx], -1)) != eslOK) goto ERROR; if (msa->ss && msa->ss[oidx] && (status = msa_set_seq_ss (new_msa, nidx, msa->ss[oidx])) != eslOK) goto ERROR; if (msa->sa && msa->sa[oidx] && (status = msa_set_seq_sa (new_msa, nidx, msa->sa[oidx])) != eslOK) goto ERROR; if (msa->pp && msa->pp[oidx] && (status = msa_set_seq_pp (new_msa, nidx, msa->pp[oidx])) != eslOK) goto ERROR; /* unparsed annotation */ for(i = 0; i < msa->ngs; i++) { if (msa->gs[i] && msa->gs[i][oidx] && (status = esl_msa_AddGS (new_msa, msa->gs_tag[i], -1, nidx, msa->gs[i][oidx], -1)) != eslOK) goto ERROR; } for(i = 0; i < msa->ngr; i++) { if (msa->gr[i] && msa->gr[i][oidx] && (status = esl_msa_AppendGR(new_msa, msa->gr_tag[i], nidx, msa->gr[i][oidx])) != eslOK) goto ERROR; } nidx++; } new_msa->flags = msa->flags; if ((status = esl_strdup(msa->name, -1, &(new_msa->name))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->desc, -1, &(new_msa->desc))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->acc, -1, &(new_msa->acc))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->au, -1, &(new_msa->au))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->ss_cons, msa->alen, &(new_msa->ss_cons))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->sa_cons, msa->alen, &(new_msa->sa_cons))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->pp_cons, msa->alen, &(new_msa->pp_cons))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->rf, msa->alen, &(new_msa->rf))) != eslOK) goto ERROR; if ((status = esl_strdup(msa->mm, msa->alen, &(new_msa->mm))) != eslOK) goto ERROR; for (i = 0; i < eslMSA_NCUTS; i++) { new_msa->cutoff[i] = msa->cutoff[i]; new_msa->cutset[i] = msa->cutset[i]; } new_msa->nseq = nnew; new_msa->sqalloc = nnew; /* Since we have a fully constructed MSA, we don't need the * aux info used by parsers. */ if (new_msa->sqlen != NULL) { free(new_msa->sqlen); new_msa->sqlen = NULL; } if (new_msa->sslen != NULL) { free(new_msa->sslen); new_msa->sslen = NULL; } if (new_msa->salen != NULL) { free(new_msa->salen); new_msa->salen = NULL; } if (new_msa->pplen != NULL) { free(new_msa->pplen); new_msa->pplen = NULL; } new_msa->lastidx = -1; *ret_new = new_msa; return eslOK; ERROR: if (new_msa != NULL) esl_msa_Destroy(new_msa); *ret_new = NULL; return status; } /* Function: esl_msa_ColumnSubset() * Synopsis: Remove a selected subset of columns from the MSA * * Purpose: Given an array (0..alen-1) of TRUE/FALSE flags, * where TRUE means "keep this column in the new alignment"; * remove all columns annotated as FALSE in the * array. This is done in-place on the MSA, so the MSA is * modified: alen> is reduced, aseq> is shrunk * (or ax>, in the case of a digital mode alignment), * and all associated per-residue or per-column annotation * is shrunk. * * Returns: on success. * Possibilities from call: * if WUSS string for or ss> * following is inconsistent. * if a derived ct array implies a pknotted SS. */ int esl_msa_ColumnSubset(ESL_MSA *msa, char *errbuf, const int *useme) { int status; int64_t opos; /* position in original alignment */ int64_t npos; /* position in new alignment */ int idx; /* sequence index */ int i; /* markup index */ /* For RNA/DNA digital alignments only: * Remove any basepairs from SS_cons and individual sequence SS * for aln columns i,j for which useme[i-1] or useme[j-1] are FALSE */ if ( msa->abc && (msa->abc->type == eslRNA || msa->abc->type == eslDNA) && (status = esl_msa_RemoveBrokenBasepairs(msa, errbuf, useme)) != eslOK) return status; /* Since we're minimizing, we can overwrite in place, within the msa * we've already got. * opos runs all the way to msa->alen to include (and move) the \0 * string terminators (or sentinel bytes, in the case of digital mode) */ for (opos = 0, npos = 0; opos <= msa->alen; opos++) { if (opos < msa->alen && useme[opos] == FALSE) continue; if (npos != opos) /* small optimization */ { /* The alignment, and per-residue annotations */ for (idx = 0; idx < msa->nseq; idx++) { if (msa->flags & eslMSA_DIGITAL) /* watch off-by-one in dsq indexing */ msa->ax[idx][npos+1] = msa->ax[idx][opos+1]; else msa->aseq[idx][npos] = msa->aseq[idx][opos]; if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][npos] = msa->ss[idx][opos]; if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][npos] = msa->sa[idx][opos]; if (msa->pp != NULL && msa->pp[idx] != NULL) msa->pp[idx][npos] = msa->pp[idx][opos]; for (i = 0; i < msa->ngr; i++) if (msa->gr[i][idx] != NULL) msa->gr[i][idx][npos] = msa->gr[i][idx][opos]; } /* The per-column annotations */ if (msa->ss_cons != NULL) msa->ss_cons[npos] = msa->ss_cons[opos]; if (msa->sa_cons != NULL) msa->sa_cons[npos] = msa->sa_cons[opos]; if (msa->pp_cons != NULL) msa->pp_cons[npos] = msa->pp_cons[opos]; if (msa->rf != NULL) msa->rf[npos] = msa->rf[opos]; if (msa->mm != NULL) msa->mm[npos] = msa->mm[opos]; for (i = 0; i < msa->ngc; i++) msa->gc[i][npos] = msa->gc[i][opos]; } npos++; } msa->alen = npos-1; /* -1 because npos includes NUL terminators */ return eslOK; } /* Function: esl_msa_MinimGaps() * Synopsis: Remove columns containing all gap symbols. * * Purpose: Remove all columns in the multiple alignment * that consist entirely of gaps or missing data. * * For a text mode alignment, is a string defining * the gap characters, such as <"-_.~">. For a digital mode * alignment, may be passed as , because the * internal alphabet already knows what the gap and missing * data characters are. * * is changed in-place to a narrower alignment * containing fewer columns. All per-residue and per-column * annotation is altered appropriately for the columns that * remain in the new alignment. * * If is TRUE, only columns that are gaps * in all sequences of and a gap in the RF annotation * of the alignment (rf>) will be removed. It is * okay if is TRUE and rf> is NULL * (no error is thrown), the function will behave as if * is FALSE. * * Returns: on success. * * Throws: on allocation failure. * Possibilities from call: * if WUSS string for or ss> * following is inconsistent. * if a derived ct array implies a pknotted SS. * * Xref: squid's MSAMingap(). */ int esl_msa_MinimGaps(ESL_MSA *msa, char *errbuf, const char *gaps, int consider_rf) { int *useme = NULL; /* array of TRUE/FALSE flags for which cols to keep */ int64_t apos; /* column index */ int idx; /* sequence index */ int status; int rf_is_nongap; /* TRUE if current position is not a gap in msa->rf OR msa->rf is NULL */ if (msa->flags & eslMSA_DIGITAL) /* be careful of off-by-one: useme is 0..L-1 indexed */ { ESL_ALLOC(useme, sizeof(int) * (msa->alen+1)); /* +1 is just to deal w/ alen=0 special case */ for (apos = 1; apos <= msa->alen; apos++) { rf_is_nongap = ((msa->rf != NULL) && (! esl_abc_CIsGap (msa->abc, msa->rf[apos-1])) && (! esl_abc_CIsMissing(msa->abc, msa->rf[apos-1]))) ? TRUE : FALSE; if(rf_is_nongap && consider_rf) { /* RF is not a gap and consider_rf is TRUE, keep this column */ useme[apos-1] = TRUE; } else { /* check all seqs to see if this column is all gaps */ for (idx = 0; idx < msa->nseq; idx++) if (! esl_abc_XIsGap (msa->abc, msa->ax[idx][apos]) && ! esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) break; if (idx == msa->nseq) useme[apos-1] = FALSE; else useme[apos-1] = TRUE; } } if ((status = esl_msa_ColumnSubset(msa, errbuf, useme)) != eslOK) goto ERROR; } if (! (msa->flags & eslMSA_DIGITAL)) /* text mode case */ { if ( (status = esl_msa_MinimGapsText(msa, errbuf, gaps, consider_rf, FALSE)) != eslOK) goto ERROR; } if (useme) free(useme); return eslOK; ERROR: if (useme) free(useme); return status; } /* Function: esl_msa_MinimGapsText() * Synopsis: Remove columns containing all gap symbols, from text mode msa * * Purpose: Same as esl\_msa\_MinimGaps(), but specialized for a text mode * alignment where we don't know the alphabet. The issue is what * to do about RNA secondary structure annotation (SS, SS\_cons) * when we remove columns, which can remove one side of a bp and * invalidate the annotation string. For digital alignments, * knows the alphabet and will fix base pairs * for RNA/DNA alignments. For text mode, though, we have to * get told to do it, because the default behavior for text mode * alis is to assume that the alphabet is totally arbitrary, and we're * not allowed to make assumptions about its symbols' meaning. * Hence, the flag here. * * Ditto for the string: we don't know what symbols * are supposed to be gaps unless we're told something like * <"-_.~">. * * Args: msa - alignment to remove all-gap cols from * errbuf - if non-, space for an informative error message on failure * gaps - string of gap characters * consider_rf - if TRUE, also consider gap/nongap cols in RF annotation line * fix_bps - if TRUE, fix any broken bps in SS/SS\_cons annotation lines. * * Returns: on success. * * Throws: on allocation failure. * Possibilities from call: * if WUSS string for or ss> * following is inconsistent. * if a derived ct array implies a pknotted SS. */ int esl_msa_MinimGapsText(ESL_MSA *msa, char *errbuf, const char *gaps, int consider_rf, int fix_bps) { int *useme = NULL; /* array of TRUE/FALSE flags for which cols to keep */ int64_t apos; /* column index */ int idx; /* sequence index */ int status; int rf_is_nongap; /* TRUE if current position is not a gap in msa->rf OR msa->rf is NULL */ ESL_ALLOC(useme, sizeof(int) * (msa->alen+1)); /* +1 is just to deal w/ alen=0 special case */ for (apos = 0; apos < msa->alen; apos++) { rf_is_nongap = ((msa->rf != NULL) && (strchr(gaps, msa->rf[apos]) == NULL)) ? TRUE : FALSE; if (rf_is_nongap && consider_rf) useme[apos] = TRUE; /* RF is not a gap and consider_rf is TRUE, keep this column */ else { /* check all seqs to see if this column is all gaps */ for (idx = 0; idx < msa->nseq; idx++) if (strchr(gaps, msa->aseq[idx][apos]) == NULL) break; useme[apos] = (idx == msa->nseq ? FALSE : TRUE); } } if (fix_bps && (status = esl_msa_RemoveBrokenBasepairs(msa, errbuf, useme)) != eslOK) goto ERROR; if ( (status = esl_msa_ColumnSubset (msa, errbuf, useme)) != eslOK) goto ERROR; free(useme); return eslOK; ERROR: if (useme) free(useme); return status; } /* Function: esl_msa_NoGaps() * Synopsis: Remove columns containing any gap symbol. * * Purpose: Remove all columns in the multiple alignment that * contain any gaps or missing data, such that the modified * MSA consists only of ungapped columns (a solid block of * residues). * * This is useful for filtering alignments prior to * phylogenetic analysis using programs that can't deal * with gaps. * * For a text mode alignment, is a string defining * the gap characters, such as <"-_.~">. For a digital mode * alignment, may be passed as , because the * internal alphabet already knows what the gap and * missing data characters are. * * is changed in-place to a narrower alignment * containing fewer columns. All per-residue and per-column * annotation is altered appropriately for the columns that * remain in the new alignment. * * Returns: on success. * * Throws: on allocation failure. * Possibilities from call: * if WUSS string for or ss> * following is inconsistent. * if a derived ct array implies a pknotted SS. * * Xref: squid's MSANogap(). */ int esl_msa_NoGaps(ESL_MSA *msa, char *errbuf, const char *gaps) { int *useme = NULL; /* array of TRUE/FALSE flags for which cols to keep */ int64_t apos; /* column index */ int idx; /* sequence index */ int status; if (msa->flags & eslMSA_DIGITAL) /* be careful of off-by-one: useme is 0..L-1 indexed */ { ESL_ALLOC(useme, sizeof(int) * (msa->alen+1)); /* +1 is only to deal with alen=0 special case */ for (apos = 1; apos <= msa->alen; apos++) { for (idx = 0; idx < msa->nseq; idx++) if (esl_abc_XIsGap (msa->abc, msa->ax[idx][apos]) || esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) break; if (idx == msa->nseq) useme[apos-1] = TRUE; else useme[apos-1] = FALSE; } if ((status = esl_msa_ColumnSubset(msa, errbuf, useme)) != eslOK) goto ERROR; } if (! (msa->flags & eslMSA_DIGITAL)) /* text mode case */ { if ((status = esl_msa_NoGapsText(msa, errbuf, gaps, FALSE)) != eslOK) goto ERROR; } if (useme) free(useme); return eslOK; ERROR: if (useme) free(useme); return status; } /* Function: esl_msa_NoGapsText() * Synopsis: Remove columns containing any gap symbol at all, for text mode msa. * * Purpose: Like but specialized for textmode where * we don't know the alphabet, yet might need to fix alphabet-dependent * problems. * * Like , the alphabet-dependent issue we might * want to fix is RNA secondary structure annotation (SS, SS\_cons); * removing a column might remove one side of a base pair annotation, and * invalidate a secondary structure string. tells the function * that SS and SS\_cons are RNA WUSS format strings, and the function is * allowed to edit (and fix) them. Normally, in text mode msa's, we * are not allowed to interpret any meaning of symbols. * * Args: msa - alignment to remove any-gap cols from * errbuf - if non-, space for an informative error message on failure * gaps - string of gap characters * fix_bps - if TRUE, fix any broken bps in SS/SS\_cons annotation lines * * Returns: on success. * * Throws: on allocation failure. * Possibilities from call: * if WUSS string for or ss> * following is inconsistent. * if a derived ct array implies a pknotted SS. */ int esl_msa_NoGapsText(ESL_MSA *msa, char *errbuf, const char *gaps, int fix_bps) { int *useme = NULL; /* array of TRUE/FALSE flags for which cols to keep */ int64_t apos; /* column index */ int idx; /* sequence index */ int status; ESL_ALLOC(useme, sizeof(int) * (msa->alen+1)); /* +1 is only to deal with alen=0 special case */ for (apos = 0; apos < msa->alen; apos++) { for (idx = 0; idx < msa->nseq; idx++) if (strchr(gaps, msa->aseq[idx][apos]) != NULL) break; useme[apos] = (idx == msa->nseq ? TRUE : FALSE); } if (fix_bps && (status = esl_msa_RemoveBrokenBasepairs(msa, errbuf, useme)) != eslOK) goto ERROR; if ( (status = esl_msa_ColumnSubset (msa, errbuf, useme)) != eslOK) goto ERROR; free(useme); return eslOK; ERROR: if (useme) free(useme); return status; } /* Function: esl_msa_SymConvert() * Synopsis: Global search/replace of symbols in an MSA. * * Purpose: In the aligned sequences in a text-mode , convert any * residue in the string to its counterpart (at the same * position) in string . * * To convert DNA to RNA, could be "Tt" and * could be "Uu". To convert IUPAC symbols to * N's, could be "RYMKSWHBVDrymkswhbvd" and * could be "NNNNNNNNNNnnnnnnnnnn". * * As a special case, if consists of a single * character, then any character in the is * converted to this character. * * Thus, must either be of the same length as * , or of length 1. Anything else will cause * undefined behavior (and probably segfault). * * The conversion is done in-place, so the is * modified. * * This is a poor man's hack for processing text mode MSAs * into a more consistent text alphabet. It is unnecessary * for digital mode MSAs, which are already in a standard * internal alphabet. Calling on a * digital mode alignment throws an error. * * Returns: on success. * * Throws: if is in digital mode, or if the * and strings aren't valid together. */ int esl_msa_SymConvert(ESL_MSA *msa, const char *oldsyms, const char *newsyms) { int64_t apos; /* column index */ int idx; /* sequence index */ char *sptr; int special; if (msa->flags & eslMSA_DIGITAL) ESL_EXCEPTION(eslEINVAL, "can't SymConvert a digital mode alignment"); if ((strlen(oldsyms) != strlen(newsyms)) && strlen(newsyms) != 1) ESL_EXCEPTION(eslEINVAL, "invalid newsyms/oldsyms pair"); special = (strlen(newsyms) == 1 ? TRUE : FALSE); for (idx = 0; idx < msa->nseq; idx++) for (apos = 0; apos < msa->alen; apos++) if ((sptr = strchr(oldsyms, msa->aseq[idx][apos])) != NULL) msa->aseq[idx][apos] = (special ? *newsyms : newsyms[sptr-oldsyms]); return eslOK; } /* Function: esl_msa_Checksum() * Synopsis: Calculate a checksum for an MSA. * Incept: SRE, Tue Sep 16 13:23:34 2008 [Janelia] * * Purpose: Calculates a 32-bit checksum for . * * Only the alignment data are considered, not the sequence * names or other annotation. For text mode alignments, the * checksum is case sensitive. * * This is used as a quick way to try to verify that a * given alignment is identical to an expected one; for * example, when HMMER is mapping new sequence alignments * onto exactly the same seed alignment an HMM was built * from. * * Returns: on success. * * Xref: The checksum is a modified version of Jenkin's hash; * see for the original and citations. */ int esl_msa_Checksum(const ESL_MSA *msa, uint32_t *ret_checksum) { uint32_t val = 0; int i,pos; if (msa->flags & eslMSA_DIGITAL) { for (i = 0; i < msa->nseq; i++) for (pos = 1; pos <= msa->alen; pos++) { val += msa->ax[i][pos]; val += (val << 10); val ^= (val >> 6); } } if (! (msa->flags & eslMSA_DIGITAL)) { for (i = 0; i < msa->nseq; i++) for (pos = 0; pos < msa->alen; pos++) { val += msa->aseq[i][pos]; val += (val << 10); val ^= (val >> 6); } } val += (val << 3); val ^= (val >> 11); val += (val << 15); *ret_checksum = val; return eslOK; } /* Function: esl_msa_RemoveBrokenBasepairsFromSS() * Synopsis: Remove basepairs about to be broken by a column downselect. * * Purpose: Given an array (0..alen-1) of TRUE/FALSE flags, * remove any basepair from an SS string that is between * alignment columns (i,j) for which either or * is FALSE. Called by * . * * The input SS string will be overwritten. If it was not * in full WUSS format when passed in, it will be upon * exit. Note that that means if there's residues in the * input ss that correspond to gaps in an aligned sequence * or RF annotation, they will not be treated as gaps in * the returned SS. For example, a gap may become a '-' * character, a '<_>' character, or a ':' character. I'm not * sure how to deal with this in a better way. We could * demand an aligned sequence to use to de-gap the SS * string, but that would require disallowing any gap to be * involved in a basepair, which I'm not sure is something * we want to forbid. * * If the original SS is inconsistent it's left untouched * and non- is returned as listed below. * * Returns: on success. * if SS string * if a derived ct array implies a pknotted * SS, this should be impossible. * * Throws: on allocation failure. */ int esl_msa_RemoveBrokenBasepairsFromSS(char *ss, char *errbuf, int len, const int *useme) { int64_t apos; /* alignment position */ int *ct = NULL; /* 0..alen-1 base pair partners array for current sequence */ int status; ESL_ALLOC(ct, sizeof(int) * (len+1)); if ((status = esl_wuss2ct(ss, len, ct)) != eslOK) ESL_FAIL(status, errbuf, "Consensus structure string is inconsistent."); for (apos = 1; apos <= len; apos++) { if (!(useme[apos-1])) { if (ct[apos] != 0) ct[ct[apos]] = 0; ct[apos] = 0; } } /* All broken bps removed from ct, convert to WUSS SS string and overwrite SS */ if ((status = esl_ct2wuss(ct, len, ss)) != eslOK) ESL_FAIL(status, errbuf, "Error converting de-knotted bp ct array to WUSS notation."); free(ct); return eslOK; ERROR: if (ct != NULL) free(ct); return status; } /* Function: esl_msa_RemoveBrokenBasepairs() * Synopsis: Remove all annotated bps about to be broken by column downselect. * * Purpose: Given an array (0..alen-1) of TRUE/FALSE flags, * remove any basepair from and individual SS * annotation in alignment columns (i,j) for which either * or is FALSE. Called * automatically from with same * . * * If the original structure data is inconsistent it's left * untouched. * * Returns: on success. * if WUSS string for or ss> * following is inconsistent. * if a derived ct array implies a pknotted * SS, this should be impossible * * Throws: on allocation failure. */ int esl_msa_RemoveBrokenBasepairs(ESL_MSA *msa, char *errbuf, const int *useme) { int status; int i; if (msa->ss_cons) { if((status = esl_msa_RemoveBrokenBasepairsFromSS(msa->ss_cons, errbuf, msa->alen, useme)) != eslOK) return status; } /* per-seq SS annotation */ if (msa->ss) { for(i = 0; i < msa->nseq; i++) { if (msa->ss[i]) { if ((status = esl_msa_RemoveBrokenBasepairsFromSS(msa->ss[i], errbuf, msa->alen, useme)) != eslOK) return status; } } } return eslOK; } /* msa_get_rlen() * * Returns the raw (unaligned) length of sequence number * in . */ static int64_t msa_get_rlen(const ESL_MSA *msa, int seqidx) { int64_t rlen = 0; int pos; if (msa->flags & eslMSA_DIGITAL) rlen = esl_abc_dsqrlen(msa->abc, msa->ax[seqidx]); if (! (msa->flags & eslMSA_DIGITAL)) { for (pos = 0; pos < msa->alen; pos++) if (isalnum(msa->aseq[seqidx][pos])) rlen++; } return rlen; } /* Function: esl_msa_ReverseComplement() * Synopsis: Reverse complement a multiple alignment * Incept: SRE, Wed Feb 10 12:52:13 2016 [JB251 BOS-MCO] * * Purpose: Reverse complement the multiple alignment , in place. * * must be in digital mode, and it must be in an alphabet * that permits reverse complementation (, ). * * In addition to reverse complementing the sequence data, * per-column and per-residue annotation also gets reversed * or reverse complemented. Secondary structure annotations * (the consensus structure , and any individual * structures ) are assumed to be in WUSS format, * and are "reverse complemented" using * . Other annotations are assumed to * be textual, and are simply reversed. Beware, because * this can go awry if an optional or annotation * has semantics that would require complementation (an RNA * structure annotation, for example). * * Returns: on success. * * Throws: if isn't digital, or isn't in an alphabet * that allows reverse complementation. */ int esl_msa_ReverseComplement(ESL_MSA *msa) { int i; int m; int status; if (! (msa->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINCOMPAT, "msa isn't digital"); if ( msa->abc->complement == NULL) ESL_EXCEPTION(eslEINCOMPAT, "msa alphabet can't be reverse complemented"); if (msa->ss_cons) esl_wuss_reverse(msa->ss_cons, msa->ss_cons); if (msa->sa_cons) esl_vec_CReverse(msa->sa_cons, msa->sa_cons, msa->alen); if (msa->pp_cons) esl_vec_CReverse(msa->pp_cons, msa->pp_cons, msa->alen); if (msa->rf) esl_vec_CReverse(msa->rf, msa->rf, msa->alen); if (msa->mm) esl_vec_CReverse(msa->mm, msa->mm, msa->alen); for (m = 0; m < msa->ngc; m++) if (msa->gc && msa->gc[m]) esl_vec_CReverse(msa->gc[m], msa->gc[m], msa->alen); for (i = 0; i < msa->nseq; i++) { if ((status = esl_abc_revcomp(msa->abc, msa->ax[i], msa->alen)) != eslOK) goto ERROR; if (msa->ss && msa->ss[i]) esl_wuss_reverse(msa->ss[i], msa->ss[i]); if (msa->sa && msa->sa[i]) esl_vec_CReverse(msa->sa[i], msa->sa[i], msa->alen); if (msa->pp && msa->pp[i]) esl_vec_CReverse(msa->pp[i], msa->pp[i], msa->alen); } for (m = 0; m < msa->ngr; m++) for (i = 0; i < msa->nseq; i++) if (msa->gr && msa->gr[m] && msa->gr[m][i]) esl_vec_CReverse(msa->gr[m][i], msa->gr[m][i], msa->alen); return eslOK; ERROR: return status; } /* Function: esl_msa_Hash() * Synopsis: Hash sequence names, internally, for faster access/lookup. * * Purpose: Caller wants to map sequence names to integer index in the * structure, using the internal index> keyhash. * Create (or recreate) that index. * * Each sequence name must be unique. If not, returns * , and index> is (if it already * existed, it is destroyed). * * Returns: on success, and index> is available for * keyhash lookups. * * if any sequence names are duplicated, and * index> is . * * Throws: on allocation error. */ int esl_msa_Hash(ESL_MSA *msa) { int idx; int status; if (msa->index) esl_keyhash_Reuse(msa->index); else msa->index = esl_keyhash_Create(); if (! msa->index) { status = eslEMEM; goto ERROR; } for (idx = 0; idx < msa->nseq; idx++) if ((status = esl_keyhash_Store(msa->index, msa->sqname[idx], -1, NULL)) != eslOK) goto ERROR; return eslOK; ERROR: if (msa->index) { esl_keyhash_Destroy(msa->index); msa->index = NULL; } return status; } /* Function: esl_msa_FlushLeftInserts() * Synopsis: Make all insertions flush left. * Incept: SRE, Tue Apr 25 17:04:47 2017 [Cambridge Dance Complex] * * Purpose: Given an alignment with reference annotation * defining consensus (match) columns versus insertions, * make all the insertions flush left (i.e. in each * insertion position in the consensus, gaps follow * inserted residues). * * This makes profile multiple alignments unique w.r.t. * arbitrary gap ordering in insert regions, since profile * alignments don't specify any alignment of insertions. * A2M alignment format, for example, does not specify any * insert alignment. * * Returns: on success. is altered, rearranging the * order of gaps and inserted residues. * * Throws: if has no reference annotation. * * Note: Written for . */ int esl_msa_FlushLeftInserts(ESL_MSA *msa) { int i, a, b; ESL_DASSERT1(( msa->flags & eslMSA_DIGITAL )); ESL_DASSERT1(( msa->abc != NULL )); ESL_DASSERT1(( msa->ax != NULL )); if (! msa->rf) ESL_EXCEPTION(eslEINVAL, "msa has no reference annotation"); for (i = 0; i < msa->nseq; i++) { for (a = 1, b = 1; a <= msa->alen; a++) { if ( ! esl_abc_CIsGap(msa->abc, msa->rf[a-1]) ) // if column[a] is consensus: { for (; b < a; b++) msa->ax[i][b] = esl_abc_XGetGap(msa->abc); } // catch b up to a, adding gaps else if (esl_abc_XIsGap(msa->abc, msa->ax[i][a])) // else if column[a] is nonconsensus, and ax[a] is a gap: continue; // do nothing, just advance to next a msa->ax[i][b++] = msa->ax[i][a]; // copy a consensus position, or nonconsensus residue } for (; b <= msa->alen; b++) msa->ax[i][b] = esl_abc_XGetGap(msa->abc); // finally, catch b up to end of alignment, adding gaps as needed. } return eslOK; } /*----------------- end of misc MSA functions -------------------*/ /***************************************************************** * 5. Debugging, testing, development *****************************************************************/ /* Function: esl_msa_Validate() * Synopsis: Validate an ESL_MSA structure. * * Purpose: Validates the fields of the structure * . Makes sure required information is present, * consistent. If so, return . * * If a problem is detected, return . Caller may * also provide an optional pointer to a buffer of * at least ; if this message buffer is * provided, an informative error message is put there. * * Args: msa - MSA structure to validate * errmsg - OPTIONAL: error message buffer, at least ; or * * Returns: on success, and (if provided) is set * to an empty string. * * on failure and (if provided) contains * the reason for the failure. */ int esl_msa_Validate(const ESL_MSA *msa, char *errmsg) { int idx; if (msa->nseq == 0) ESL_FAIL(eslFAIL, errmsg, "no alignment data found"); for (idx = 0; idx < msa->nseq; idx++) { if (msa->flags & eslMSA_DIGITAL) { if (! msa->ax || ! msa->ax[idx]) ESL_FAIL(eslFAIL, errmsg, "seq %d: no sequence", idx); if (esl_abc_dsqlen(msa->ax[idx]) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "seq %d: wrong length", idx); } if (! (msa->flags & eslMSA_DIGITAL)) { if (! msa->aseq || ! msa->aseq[idx]) ESL_FAIL(eslFAIL, errmsg, "seq %d: no sequence", idx); if (strlen(msa->aseq[idx]) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "seq %d: wrong length", idx); } /* either all weights must be set, or none of them */ if ( msa->flags & eslMSA_HASWGTS) { if (msa->wgt[idx] == -1.0) ESL_FAIL(eslFAIL, errmsg, "seq %d: no weight set", idx);} else { if (msa->wgt[idx] != 1.0) ESL_FAIL(eslFAIL, errmsg, "seq %d: HASWGTS flag down, wgt must be default", idx); } if (msa->ss && msa->ss[idx] && strlen(msa->ss[idx]) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "seq %d: SS wrong length", idx); if (msa->sa && msa->sa[idx] && strlen(msa->sa[idx]) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "seq %d: SA wrong length", idx); if (msa->pp && msa->pp[idx] && strlen(msa->pp[idx]) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "seq %d: PP wrong length", idx); } /* if cons SS is present, must have length right */ if (msa->ss_cons && strlen(msa->ss_cons) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "SS_cons wrong length"); if (msa->sa_cons && strlen(msa->sa_cons) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "SA_cons wrong length"); if (msa->pp_cons && strlen(msa->pp_cons) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "PP_cons wrong length"); if (msa->rf && strlen(msa->rf) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "RF wrong length"); if (msa->mm && strlen(msa->mm ) != msa->alen) ESL_FAIL(eslFAIL, errmsg, "MM wrong length"); return eslOK; } /* Function: esl_msa_Compare() * Synopsis: Compare two MSAs for equality. * * Purpose: Returns if the mandatory and optional contents * of MSAs and are identical; otherwise return * . * * Only mandatory and parsed optional information is * compared. Unparsed Stockholm markup is not compared. */ int esl_msa_Compare(ESL_MSA *a1, ESL_MSA *a2) { if (esl_msa_CompareMandatory(a1, a2) != eslOK) return eslFAIL; if (esl_msa_CompareOptional(a1, a2) != eslOK) return eslFAIL; return eslOK; } /* Function: esl_msa_CompareMandatory() * Synopsis: Compare mandatory subset of MSA contents. * Incept: SRE, Wed Jun 13 09:42:56 2007 [Janelia] * * Purpose: Compare mandatory contents of two MSAs, and . * This comprises (or , for a digital alignment); * , , , , and . * * Returns: if the MSAs are identical; * if they are not. */ int esl_msa_CompareMandatory(ESL_MSA *a1, ESL_MSA *a2) { int i; if (a1->nseq != a2->nseq) return eslFAIL; if (a1->alen != a2->alen) return eslFAIL; if (a1->flags != a2->flags) return eslFAIL; for (i = 0; i < a1->nseq; i++) { if (strcmp(a1->sqname[i], a2->sqname[i]) != 0) return eslFAIL; if (esl_DCompare(a1->wgt[i], a2->wgt[i], 0.001) != eslOK) return eslFAIL; if ((a1->flags & eslMSA_DIGITAL) && memcmp(a1->ax[i], a2->ax[i], sizeof(ESL_DSQ) * (a1->alen+2)) != 0) return eslFAIL; if (! (a1->flags & eslMSA_DIGITAL) && strcmp(a1->aseq[i], a2->aseq[i]) != 0) return eslFAIL; } return eslOK; } /* Function: esl_msa_CompareOptional() * Synopsis: Compare optional subset of MSA contents. * Incept: SRE, Wed Jun 13 09:52:48 2007 [Janelia] * * Purpose: Compare optional contents of two MSAs, and . * * Returns: if the MSAs are identical; * if they are not. */ int esl_msa_CompareOptional(ESL_MSA *a1, ESL_MSA *a2) { int i; if (esl_CCompare(a1->name, a2->name) != eslOK) return eslFAIL; if (esl_CCompare(a1->desc, a2->desc) != eslOK) return eslFAIL; if (esl_CCompare(a1->acc, a2->acc) != eslOK) return eslFAIL; if (esl_CCompare(a1->au, a2->au) != eslOK) return eslFAIL; if (esl_CCompare(a1->ss_cons, a2->ss_cons) != eslOK) return eslFAIL; if (esl_CCompare(a1->sa_cons, a2->sa_cons) != eslOK) return eslFAIL; if (esl_CCompare(a1->pp_cons, a2->pp_cons) != eslOK) return eslFAIL; if (esl_CCompare(a1->rf, a2->rf) != eslOK) return eslFAIL; if (esl_CCompare(a1->mm, a2->mm) != eslOK) return eslFAIL; if (a1->sqacc != NULL && a2->sqacc != NULL) { for (i = 0; i < a1->nseq; i++) if (esl_CCompare(a1->sqacc[i], a2->sqacc[i]) != eslOK) return eslFAIL; } else if (a1->sqacc != NULL || a2->sqacc != NULL) return eslFAIL; if (a1->sqdesc != NULL && a2->sqdesc != NULL) { for (i = 0; i < a1->nseq; i++) if (esl_CCompare(a1->sqdesc[i], a2->sqdesc[i]) != eslOK) return eslFAIL; } else if (a1->sqdesc != NULL || a2->sqdesc != NULL) return eslFAIL; if (a1->ss != NULL && a2->ss != NULL) { for (i = 0; i < a1->nseq; i++) if (esl_CCompare(a1->ss[i], a2->ss[i]) != eslOK) return eslFAIL; } else if (a1->ss != NULL || a2->ss != NULL) return eslFAIL; if (a1->sa != NULL && a2->sa != NULL) { for (i = 0; i < a1->nseq; i++) if (esl_CCompare(a1->sa[i], a2->sa[i]) != eslOK) return eslFAIL; } else if (a1->sa != NULL || a2->sa != NULL) return eslFAIL; if (a1->pp != NULL && a2->pp != NULL) { for (i = 0; i < a1->nseq; i++) if (esl_CCompare(a1->pp[i], a2->pp[i]) != eslOK) return eslFAIL; } else if (a1->pp != NULL || a2->pp != NULL) return eslFAIL; for (i = 0; i < eslMSA_NCUTS; i++) { if (a1->cutset[i] && a2->cutset[i]) { if (esl_FCompare(a1->cutoff[i], a2->cutoff[i], 0.01) != eslOK) return eslFAIL; } else if (a1->cutset[i] || a2->cutset[i]) return eslFAIL; } return eslOK; } /* Function: esl_msa_Sample() * Synopsis: Sample a random, ugly for test purposes. * Incept: SRE, Fri Apr 21 09:51:45 2017 [Culprit 1, Strings Outro] * * Purpose: Sample a random digital MSA with a random depth 1.. * random sequences and a random alignment length * 1... Return the MSA via <*ret_msa>. Caller is * responsible for free'ing it with . * * Currently generates aligned sequences , names * plus a randomly generated reference/consensus * annotation line. Sets mandatory weights to default all * 1.0. * * Returns: on success, and <*ret_msa> is the sampled MSA. * * Throws: on allocation failure, and <*ret_msa> is . * * Notes: I wrote this for A2M format unit tests, which is why it's * not sampling random weights (A2M doesn't store weights), * and why it samples a reference annotation line (A2M * always implies one). If you extend this for other * purposes, make sure you don't break the A2M format * tests. We may even want to pull this out into its own * module, so we can write something a bit for flexible for * passing options to it, to control what it does and * doesn't generate. */ int esl_msa_Sample(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, int max_nseq, int max_alen, ESL_MSA **ret_msa) { ESL_MSA *msa = NULL; int nseq = 1 + esl_rnd_Roll(rng, max_nseq); // 1..max_nseq int alen = 1 + esl_rnd_Roll(rng, max_alen); // 1..max_alen int maxn = 30; double pgap = 0.1; double pdegen = 0.02; double pcons = 0.7; char *buf = NULL; int n; int i,pos; int status; if ((msa = esl_msa_CreateDigital(abc, nseq, alen)) == NULL) { status = eslEMEM; goto ERROR; } /* Randomized digital sequence alignment */ for (i = 0; i < nseq; i++) { msa->ax[i][0] = msa->ax[i][alen+1] = eslDSQ_SENTINEL; for (pos = 1; pos <= alen; pos++) { if (esl_random(rng) < pgap) msa->ax[i][pos] = abc->K; else if (esl_random(rng) < pdegen) msa->ax[i][pos] = (abc->K+1) + esl_rnd_Roll(rng, abc->Kp-abc->K-3); else msa->ax[i][pos] = esl_rnd_Roll(rng, abc->K); } } /* Random names */ ESL_ALLOC(buf, sizeof(char) * (maxn+1)); for (i = 0; i < nseq; i++) { do { n = 1 + esl_rnd_Roll(rng, maxn); // 1..maxn esl_rsq_Sample(rng, eslRSQ_SAMPLE_GRAPH, n, &buf); // one word, no space } while (ispunct(buf[0])); // #, // are bad things for names to start with esl_msa_SetSeqName(msa, i, buf, n); } /* Random reference/consensus line */ ESL_ALLOC(msa->rf, sizeof(char) * (alen+1)); for (pos = 0; pos < alen; pos++) msa->rf[pos] = (esl_random(rng) < pcons ? 'x' : '.'); msa->rf[alen] = '\0'; /* Set weights to default 1.0 */ esl_msa_SetDefaultWeights(msa); free(buf); *ret_msa = msa; return eslOK; ERROR: if (buf) free(buf); esl_msa_Destroy(msa); *ret_msa = NULL; return status; } /*---------------- end of debugging/development routines -------------------*/ /****************************************************************************** * 15. Unit tests *****************************************************************************/ #ifdef eslMSA_TESTDRIVE #include "esl_msafile.h" /* write_known_msa() * Write a known MSA to a tmpfile in Stockholm format. */ static void write_known_msa(FILE *ofp) { fprintf(ofp, "# STOCKHOLM 1.0\n"); fprintf(ofp, "seq1 --ACDEFGHIK~LMNPQRS-TVWY\n"); fprintf(ofp, "seq2 aaACDEFGHIK~LMNPQRS-TVWY\n"); fprintf(ofp, "seq3 aaACDEFGHIK~LMNPQRS-TVWY\n"); fprintf(ofp, "\n"); fprintf(ofp, "seq1 ACDEFGHIKLMNPQRSTVWY~~~\n"); fprintf(ofp, "seq2 ACDEFGHIKLMNPQRSTVWYyyy\n"); fprintf(ofp, "seq3 ACDEFGHIKLMNPQRSTVWYyyy\n"); fprintf(ofp, "//\n"); return; } /* compare_to_known() * SRE, Thu Sep 7 09:52:07 2006 [Janelia] * Spotcheck an ESL_MSA to make sure it matches the test known alignment. */ static void compare_to_known(ESL_MSA *msa) { if (msa->alen != 47) esl_fatal("bad alen"); if (msa->nseq != 3) esl_fatal("bad nseq"); if (strcmp(msa->sqname[1], "seq2") != 0) esl_fatal("bad sqname"); if (msa->flags & eslMSA_DIGITAL) { if (! esl_abc_XIsGap(msa->abc, msa->ax[0][2])) esl_fatal("no gap where expected"); if (! esl_abc_XIsMissing(msa->abc, msa->ax[0][47])) esl_fatal("no missing-data symbol where expected"); if (msa->ax[1][1] != 0) esl_fatal("spotcheck on ax failed"); /* 0=A */ if (msa->ax[1][47] != 19) esl_fatal("spotcheck on ax failed"); /*19=Y */ } if (! (msa->flags & eslMSA_DIGITAL)) { if (strcasecmp(msa->aseq[0], "--ACDEFGHIK~LMNPQRS-TVWYACDEFGHIKLMNPQRSTVWY~~~") != 0) esl_fatal("aseq 0 is bad"); if (strcasecmp(msa->aseq[1], "aaACDEFGHIK~LMNPQRS-TVWYACDEFGHIKLMNPQRSTVWYyyy") != 0) esl_fatal("aseq 1 is bad"); if (strcasecmp(msa->aseq[2], "aaACDEFGHIK~LMNPQRS-TVWYACDEFGHIKLMNPQRSTVWYyyy") != 0) esl_fatal("aseq 2 is bad"); } return; } /* Unit tests for every function in the exposed API */ static void utest_Create(void) { ESL_MSA *msa = NULL; msa = esl_msa_Create(16, -1); /* nseq blocksize 16, growable */ esl_msa_Destroy(msa); msa = esl_msa_Create(16, 100); /* nseq=16, alen=100, not growable */ esl_msa_Destroy(msa); return; } static void utest_Sizeof(ESL_RANDOMNESS *rng) { const char msg[] = "utest_Sizeof() failed"; ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); ESL_MSA *msa = NULL; int nseq = 100; int alen = 200; size_t n = 0; if (esl_msa_Sample(rng, abc, nseq, alen, &msa) != eslOK) esl_fatal(msg); if ((n = esl_msa_Sizeof(msa)) <= 0) esl_fatal(msg); esl_msa_Destroy(msa); esl_alphabet_Destroy(abc); } static void utest_Destroy(void) { ESL_ALPHABET *abc = NULL; ESL_MSA *msa = NULL; msa = esl_msa_Create(16, -1); esl_msa_Destroy(msa); /* normal usage */ abc = esl_alphabet_Create(eslRNA); msa = esl_msa_CreateDigital(abc, 16, 100); esl_msa_Destroy(msa); /* normal usage, digital mode */ esl_alphabet_Destroy(abc); esl_msa_Destroy(NULL); /* should tolerate NULL argument */ return; } static void utest_Expand(void) { ESL_ALPHABET *abc = NULL; ESL_MSA *msa = NULL; msa = esl_msa_Create(16, -1); /* growable */ if (esl_msa_Expand(msa) != eslOK) esl_fatal("Expand failed"); /* expand by 2x in nseq */ esl_msa_Destroy(msa); msa = esl_msa_Create(16, 100); /* not growable */ #ifdef eslTEST_THROWING if (esl_msa_Expand(msa) != eslEINVAL) esl_fatal("Expand should have failed but didn't"); /* should fail w/ EINVAL*/ #endif esl_msa_Destroy(msa); abc = esl_alphabet_Create(eslDNA); msa = esl_msa_CreateDigital(abc, 16, -1); /* growable */ if (esl_msa_Expand(msa) != eslOK) esl_fatal("Expand failed"); /* expand by 2x in nseq */ esl_msa_Destroy(msa); msa = esl_msa_CreateDigital(abc, 16, 100); /* not growable */ #ifdef eslTEST_THROWING if (esl_msa_Expand(msa) != eslEINVAL) esl_fatal("Expand should have failed but didn't"); /* should fail w/ EINVAL*/ #endif /* eslTEST_THROWING*/ esl_msa_Destroy(msa); esl_alphabet_Destroy(abc); return; } static void utest_CreateDigital(ESL_ALPHABET *abc) { char *msg = "CreateDigital() unit test failure"; ESL_MSA *msa = NULL; msa = esl_msa_CreateDigital(abc, 16, -1); /* nseq blocksize 16, growable */ if (! (msa->flags & eslMSA_DIGITAL)) esl_fatal(msg); if (msa->ax == NULL) esl_fatal(msg); if (msa->aseq != NULL) esl_fatal(msg); if (esl_msa_Expand(msa) != eslOK) esl_fatal(msg); esl_msa_Destroy(msa); msa = esl_msa_CreateDigital(abc, 16, 100); /* nseq=16, alen=100, not growable */ #ifdef eslTEST_THROWING if (esl_msa_Expand(msa) != eslEINVAL) esl_fatal(msg); /* shouldn't grow */ #endif esl_msa_Destroy(msa); return; } static void utest_Digitize(ESL_ALPHABET *abc, char *filename) { char *msg = "Digitize() unit test failure"; ESL_MSAFILE *mfp = NULL; ESL_MSA *msa = NULL; int c, i, pos; /* Get ourselves a copy of the known alignment that we can muck with */ if (esl_msafile_Open(NULL, filename, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); /* Deliberately corrupt it with inval character in the middle */ i = msa->nseq / 2; pos = msa->alen / 2; c = msa->aseq[i][pos]; msa->aseq[i][pos] = '%'; if (esl_msa_Digitize(abc, msa, NULL) != eslEINVAL) esl_fatal(msg); /* should detect corruption as normal error */ msa->aseq[i][pos] = c; /* restore original */ compare_to_known(msa); if (esl_msa_Digitize(abc, msa, NULL) != eslOK) esl_fatal(msg); /* should be fine now */ compare_to_known(msa); esl_msa_Destroy(msa); return; } static void utest_Textize(ESL_ALPHABET *abc, char *filename) { char *msg = "Textize() unit test failure"; ESL_MSAFILE *mfp = NULL; ESL_MSA *msa = NULL; if (esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); if (esl_msa_Textize(msa) != eslOK) esl_fatal(msg); compare_to_known(msa); esl_msafile_Close(mfp); esl_msa_Destroy(msa); return; } static void utest_SequenceSubset(ESL_MSA *m1) { char *msg = "SequenceSubset() unit test failure"; ESL_MSA *m2 = NULL; int *useme = NULL; int i,j; int n2; /* Make every other sequence (1,3..) get excluded from the subset */ useme = malloc(m1->nseq * sizeof(int)); for (i = 0, n2 = 0; i < m1->nseq; i++) if (i%2 == 0) { useme[i] = TRUE; n2++; } else useme[i] = FALSE; if (esl_msa_SequenceSubset(m1, useme, &m2) != eslOK) esl_fatal(msg); if (m2->nseq != n2) esl_fatal(msg); for (i = 0, j = 0; i < m1->nseq; i++) { if (useme[i]) { if (strcmp(m1->sqname[i], m2->sqname[j]) != 0) esl_fatal(msg); if (! (m1->flags & eslMSA_DIGITAL) && (strcmp(m1->aseq[i], m2->aseq[j]) != 0)) esl_fatal(msg); if ( (m1->flags & eslMSA_DIGITAL) && memcmp(m1->ax[i], m2->ax[j], sizeof(ESL_DSQ) * (m1->alen+2)) != 0) esl_fatal(msg); j++; } } esl_msa_Destroy(m2); free(useme); return; } static void utest_MinimGaps(char *tmpfile) { char *msg = "MinimGaps() unit test failure"; ESL_MSAFILE *mfp = NULL; ESL_MSA *msa = NULL; ESL_ALPHABET *abc = NULL; if (esl_msafile_Open(NULL, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); if (esl_msa_MinimGaps(msa, NULL, "-~", FALSE) != eslOK) esl_fatal(msg); if (msa->alen != 45) esl_fatal(msg); /* orig =47, with one all - column and one all ~ column */ if (msa->aseq[0][11] != 'L') esl_fatal(msg); /* L shifted from column 13->12 */ if (msa->aseq[0][18] != 'T') esl_fatal(msg); /* T shifted from column 21->19 */ esl_msa_Destroy(msa); if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg); if (esl_msafile_Open(&abc, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); if (esl_msa_MinimGaps(msa, NULL, NULL, FALSE) != eslOK) esl_fatal(msg); if (msa->alen != 45) esl_fatal(msg); /* orig =47, with one all - column and one all ~ column */ if (esl_msa_Textize(msa) != eslOK) esl_fatal(msg); if (msa->aseq[0][11] != 'L') esl_fatal(msg); /* L shifted from column 13->12 */ if (msa->aseq[0][18] != 'T') esl_fatal(msg); /* T shifted from column 21->19 */ esl_msa_Destroy(msa); esl_alphabet_Destroy(abc); return; } static void utest_NoGaps(char *tmpfile) { char *msg = "NoGaps() unit test failure"; ESL_MSAFILE *mfp = NULL; ESL_MSA *msa = NULL; ESL_ALPHABET *abc = NULL; if (esl_msafile_Open(NULL, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); if (esl_msa_NoGaps(msa, NULL, "-~") != eslOK) esl_fatal(msg); if (msa->alen != 40) esl_fatal(msg); /* orig =47, w/ 7 columns with gaps */ if (msa->aseq[0][9] != 'L') esl_fatal(msg); /* L shifted from column 13->10 */ if (msa->aseq[0][16] != 'T') esl_fatal(msg); /* T shifted from column 21->17 */ if (msa->aseq[0][39] != 'Y') esl_fatal(msg); /* Y shifted from column 47->40 */ esl_msa_Destroy(msa); if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg); if (esl_msafile_Open(&abc, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); if (esl_msa_NoGaps(msa, NULL, NULL) != eslOK) esl_fatal(msg); if (msa->alen != 40) esl_fatal(msg); /* orig =47, with one all - column and one all ~ column */ if (esl_msa_Textize(msa) != eslOK) esl_fatal(msg); if (msa->aseq[0][9] != 'L') esl_fatal(msg); /* L shifted from column 13->10 */ if (msa->aseq[0][16] != 'T') esl_fatal(msg); /* T shifted from column 21->17 */ if (msa->aseq[0][39] != 'Y') esl_fatal(msg); /* Y shifted from column 47->40 */ esl_msa_Destroy(msa); esl_alphabet_Destroy(abc); return; } static void utest_MarkFragments(void) { char msg[] = "esl_msa MarkFragments utest failed"; ESL_BITFIELD *fragassign = NULL; ESL_ALPHABET *abc = esl_alphabet_Create(eslRNA); int i; ESL_MSA *msa = esl_msa_CreateFromString("# STOCKHOLM 1.0\n" "seq1 AAAAAAAAAAAAAAAAAAAA\n" "seq2 .........AAAAAAAAAAA\n" "seq3 ..........AAAAAAAAAA\n" "seq4 ...........AAAAAAAAA\n" "//\n", eslMSAFILE_STOCKHOLM); while (1) // first pass: text mode. second pass: digital mode. { /* At fragthresh of 0.5, seqs with aspan/alen >= 10/20 cols are "full length" * ... that's seq1, seq2, seq3. */ if ( esl_msa_MarkFragments(msa, 0.5, &fragassign) != eslOK) esl_fatal(msg); if ( esl_bitfield_IsSet(fragassign, 0)) esl_fatal(msg); if ( esl_bitfield_IsSet(fragassign, 1)) esl_fatal(msg); if ( esl_bitfield_IsSet(fragassign, 2)) esl_fatal(msg); if (!esl_bitfield_IsSet(fragassign, 3)) esl_fatal(msg); esl_bitfield_Destroy(fragassign); /* At fragthresh of 0, all seqs are "full length" */ if ( esl_msa_MarkFragments(msa, 0., &fragassign) != eslOK) esl_fatal(msg); for (i = 0; i < msa->nseq; i++) if (esl_bitfield_IsSet(fragassign, i)) esl_fatal(msg); esl_bitfield_Destroy(fragassign); /* At fragthresh of 1, only seq1 is "full length" */ if ( esl_msa_MarkFragments(msa, 1.0, &fragassign) != eslOK) esl_fatal(msg); if ( esl_bitfield_IsSet(fragassign, 0)) esl_fatal(msg); for (i = 1; i < msa->nseq; i++) if (! esl_bitfield_IsSet(fragassign, i)) esl_fatal(msg); esl_bitfield_Destroy(fragassign); if (msa->flags & eslMSA_DIGITAL) break; if ( esl_msa_Digitize(abc, msa, NULL) != eslOK) esl_fatal(msg); } esl_alphabet_Destroy(abc); esl_msa_Destroy(msa); } static void utest_SymConvert(char *tmpfile) { char *msg = "SymConvert() unit test failure"; ESL_MSAFILE *mfp = NULL; ESL_MSA *msa = NULL; ESL_ALPHABET *abc = NULL; if (esl_msafile_Open(NULL, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); /* many->one version */ if (esl_msa_SymConvert(msa, "VWY", "-") != eslOK) esl_fatal(msg); /* 6 columns convert to all-gap: now 8/47 */ if (esl_msa_MinimGaps(msa, NULL, "-~", FALSE) != eslOK) esl_fatal(msg); /* now we're 39 columns long */ if (msa->alen != 39) esl_fatal(msg); /* many->many version */ if (esl_msa_SymConvert(msa, "DEF", "VWY") != eslOK) esl_fatal(msg); if (msa->aseq[0][4] != 'V') esl_fatal(msg); if (msa->aseq[0][5] != 'W') esl_fatal(msg); if (msa->aseq[0][23] != 'Y') esl_fatal(msg); /* F in orig col 29; -5; converted to Y */ /* bad calls */ #ifdef eslTEST_THROWING if (esl_msa_SymConvert(msa, "XXX", "XX") != eslEINVAL) esl_fatal(msg); /* check for clean fail on mismatched args */ #endif esl_msa_Destroy(msa); if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg); if (esl_msafile_Open(&abc, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); #ifdef eslTEST_THROWING if (esl_msa_SymConvert(msa, "Tt", "Uu") != eslEINVAL) esl_fatal(msg); /* must cleanly fail on digital mode msa */ #endif esl_msa_Destroy(msa); esl_alphabet_Destroy(abc); return; } /* Exercise a boundary case: zero length MSA (alen=0) */ /* Given an input *digital* MSA as a starting point, we clone it, * column subset it to zero length, then make sure that * various MSA functions operate correctly on it; * then we textize it and test it in text mode; then we * digitize it again, and throw it away. * (The input is unchanged.) */ static void utest_ZeroLengthMSA(const char *tmpfile) { char *msg = "zero length msa unit test failed"; ESL_ALPHABET *abc = NULL; ESL_MSAFILE *mfp = NULL; ESL_MSA *z1 = NULL; ESL_MSA *z2 = NULL; ESL_MSA *z3 = NULL; int *useme = NULL; int nuseme = 0; int i; char errbuf[eslERRBUFSIZE]; /* Read a text mode alignment from the tmpfile */ if (esl_msafile_Open(NULL, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &z1) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); /* make an alen=0 text alignment by column subsetting */ nuseme = ESL_MAX(z1->alen, z1->nseq); if ((useme = malloc(sizeof(int) * nuseme)) == NULL) esl_fatal(msg); for (i = 0; i < z1->alen; i++) useme[i] = 0; if (esl_msa_ColumnSubset(z1, errbuf, useme) != eslOK) esl_fatal(msg); /* These should all no-op if alen=0*/ if (esl_msa_MinimGaps(z1, NULL, "-", FALSE) != eslOK) esl_fatal(msg); if (esl_msa_NoGaps(z1, NULL, "-") != eslOK) esl_fatal(msg); if (esl_msa_SymConvert(z1,"RY","NN") != eslOK) esl_fatal(msg); /* test sequence subsetting by removing the first sequence */ for (i = 1; i < z1->nseq; i++) useme[i] = 1; if (esl_msa_SequenceSubset(z1, useme, &z2) != eslOK) esl_fatal(msg); esl_msa_Destroy(z1); z1 = NULL; // we may reuse z1 below. /* keep z2; we'll compare it to z3 in the end */ /* Now read the same alignment, in digital mode */ if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg); if (esl_msafile_Open(&abc, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(mfp, &z1) != eslOK) esl_fatal(msg); esl_msafile_Close(mfp); /* Now make an alen=0 alignment in digital mode */ for (i = 0; i < z1->alen; i++) useme[i] = 0; if (esl_msa_ColumnSubset(z1, errbuf, useme) != eslOK) esl_fatal(msg); /* again these should all no-op if alen=0*/ if (esl_msa_MinimGaps(z1, NULL, NULL, FALSE) != eslOK) esl_fatal(msg); if (esl_msa_NoGaps(z1, NULL, NULL) != eslOK) esl_fatal(msg); /* SymConvert throws EINVAL on a digital mode alignment */ /* test sequence subsetting by removing the first sequence */ for (i = 1; i < z1->nseq; i++) useme[i] = 1; if (esl_msa_SequenceSubset(z1, useme, &z3) != eslOK) esl_fatal(msg); esl_msa_Destroy(z1); if ((z1 = esl_msa_Clone(z3)) == NULL) esl_fatal(msg); /* z1 is now alen=0, digital */ if (esl_msa_Textize(z3) != eslOK) esl_fatal(msg); /* convert z3 back to text mode */ if (esl_msa_Compare(z2, z3) != eslOK) esl_fatal(msg); /* compare in text mode */ if (esl_msa_Digitize(abc, z2, NULL) != eslOK) esl_fatal(msg); /* now z2 is digital */ if (esl_msa_Compare(z1, z2) != eslOK) esl_fatal(msg); /* compare digital mode z1,z2 */ esl_alphabet_Destroy(abc); esl_msa_Destroy(z1); esl_msa_Destroy(z3); esl_msa_Destroy(z2); free(useme); } static void utest_Sample(ESL_RANDOMNESS *rng) { char msg[] = "esl_msa sample unit test failed"; ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); ESL_MSA *msa = NULL; if (esl_msa_Sample(rng, abc, 100, 100, &msa) != eslOK) esl_fatal(msg); if (esl_msa_Validate(msa, NULL) != eslOK) esl_fatal(msg); esl_msa_Destroy(msa); esl_alphabet_Destroy(abc); } #endif /*eslMSA_TESTDRIVE*/ /*------------------------ end of unit tests --------------------------------*/ /***************************************************************************** * 7. Test driver *****************************************************************************/ #ifdef eslMSA_TESTDRIVE #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_getopts.h" #include "esl_keyhash.h" #include "esl_random.h" #include "esl_randomseq.h" #include "esl_ssi.h" #include "esl_msa.h" static ESL_OPTIONS options[] = { /* name type default env range togs reqs incomp help docgrp */ {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0}, {"-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0}, { 0,0,0,0,0,0,0,0,0,0}, }; static char usage[] = "[-options]"; static char banner[] = "test driver for msa module"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); ESL_MSAFILE *mfp = NULL; ESL_MSA *msa = NULL; FILE *fp = NULL; char tmpfile[16] = "esltmpXXXXXX"; /* tmpfile template */ ESL_ALPHABET *abc = NULL; fprintf(stderr, "## %s\n", argv[0]); fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng)); #ifdef eslTEST_THROWING esl_exception_SetHandler(&esl_nonfatal_handler); #endif /* Create a known Stockholm test alignment in a tempfile. */ if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal("failed to create tmpfile"); write_known_msa(fp); fclose(fp); /* Read it back in for use in tests. */ if (esl_msafile_Open(NULL, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal("Failed to open MSA tmp file"); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal("Failed to read MSA tmp file"); esl_msafile_Close(mfp); /* Unit tests */ utest_Create(); utest_Sizeof(rng); utest_Destroy(); utest_Expand(); utest_SequenceSubset(msa); utest_MinimGaps(tmpfile); utest_NoGaps(tmpfile); utest_MarkFragments(); utest_SymConvert(tmpfile); utest_ZeroLengthMSA(tmpfile); utest_Sample(rng); esl_msa_Destroy(msa); if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal("alphabet creation failed"); if (esl_msafile_Open(&abc, tmpfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &mfp) != eslOK) esl_fatal("MSA digital open failed"); if (esl_msafile_Read(mfp, &msa) != eslOK) esl_fatal("MSA digital read failed"); esl_msafile_Close(mfp); utest_CreateDigital(abc); utest_Digitize(abc, tmpfile); utest_Textize(abc, tmpfile); fprintf(stderr, "# status = ok\n"); esl_msa_Destroy(msa); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); remove(tmpfile); exit(0); /* success */ } #endif /*eslMSA_TESTDRIVE*/ /*-------------------- end of test driver ---------------------*/