/* Pairwise identities, distances, and distance matrices. * * Contents: * 1. Pairwise distances for aligned text sequences. * 2. Pairwise distances for aligned digital seqs. * 3. Distance matrices for aligned text sequences. * 4. Distance matrices for aligned digital sequences. * 5. Average pairwise identity for multiple alignments. * 6. Private (static) functions. * 7. Unit tests. * 8. Test driver. * 9. Example. */ #include #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_dmatrix.h" #include "esl_random.h" #include "esl_distance.h" /* Forward declaration of our static functions. */ static int jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance); /***************************************************************** * 1. Pairwise distances for aligned text sequences. *****************************************************************/ /* Function: esl_dst_CPairId() * Synopsis: Pairwise identity of two aligned text strings. * Incept: SRE, Mon Apr 17 20:06:07 2006 [St. Louis] * * Purpose: Calculates pairwise fractional identity between two * aligned character strings and . * Return this distance in ; return the * number of identities counted in ; and * return the denominator in * . * * Alphabetic symbols <[a-zA-Z]> are compared * case-insensitively for identity. Any nonalphabetic * character is assumed to be a gap symbol. * * This simple comparison rule is unaware of synonyms and * degeneracies in biological alphabets. For a more * sophisticated and biosequence-aware comparison, use * digitized sequences and the function * instead. Note that currently does * not correctly handle degeneracies, but is set up to. * * Args: as1 - aligned character string 1 * as2 - aligned character string 2 * opt_pid - optRETURN: pairwise identity, 0<=x<=1 * opt_nid - optRETURN: # of identities * opt_n - optRETURN: denominator MIN(len1,len2) * * Returns: on success. , , * contain the answers (for whichever were passed non-NULL). * * Throws: if the strings are different lengths * (not aligned). */ int esl_dst_CPairId(const char *as1, const char *as2, double *opt_pid, int *opt_nid, int *opt_n) { int nid; /* total identical positions */ int len1, len2; /* lengths of seqs */ int i; /* position in aligned seqs */ int status; nid = len1 = len2 = 0; for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++) { if (isalpha(as1[i])) len1++; if (isalpha(as2[i])) len2++; if (isalpha(as1[i]) && isalpha(as2[i]) && toupper(as1[i]) == toupper(as2[i])) nid++; } len1 = ESL_MIN(len1, len2); if (as1[i] != '\0' || as2[i] != '\0') ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_pid != NULL) *opt_pid = ( len1==0 ? 0. : (double) nid / (double) len1); if (opt_nid != NULL) *opt_nid = nid; if (opt_n != NULL) *opt_n = len1; return eslOK; ERROR: if (opt_pid != NULL) *opt_pid = 0.; if (opt_nid != NULL) *opt_nid = 0; if (opt_n != NULL) *opt_n = 0; return status; } /* Function: esl_dst_CPairMatch() * Synopsis: Pairwise matches of two aligned text strings. * Incept: ER, Wed Oct 29 09:02:35 EDT 2014 [janelia] * * Purpose: Calculates pairwise fractional matches between two aligned * character strings and , in the pairHMM * sense, where a match state M is any aligned residue pair * (not necessarily an identity), and I and D are X- and -X * singlets. * * Return the fraction of matches, M / (M+I+D), in * ; return the number of matches M counted in * ; and return the denominator M+I+D, which is * , in <*opt_n>. * * Alphabetic symbols <[a-zA-Z]> are compared * case-insensitively for identity. Any nonalphabetic * character is assumed to be a gap symbol. * * This simple comparison rule is unaware of synonyms and * degeneracies in biological alphabets. For a more * sophisticated and biosequence-aware comparison, use * digitized sequences and the function * instead. Note that currently does * not correctly handle degeneracies, but is set up to. * * Args: as1 - aligned character string 1 * as2 - aligned character string 2 * opt_pm - optRETURN: fraction of M states, M/(M+D+I) [0..1] * opt_nm - optRETURN: # of match states, M * opt_n - optRETURN: denominator alen - double_gaps, M+D+1 * * Returns: on success. , , * contain the answers (for whichever were passed non-NULL). * * Throws: if the strings are different lengths * (not aligned). */ int esl_dst_CPairMatch(const char *as1, const char *as2, double *opt_pm, int *opt_nm, int *opt_n) { int nm; // total matched positions int len; // length of alignment (no double gaps) int i; // position in aligned seqs int status; nm = len = 0; for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++) { if (isalpha(as1[i]) || isalpha(as2[i])) len++; if (isalpha(as1[i]) && isalpha(as2[i])) nm++; } if (as1[i] != '\0' || as2[i] != '\0') ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_pm) *opt_pm = ( len==0 ? 0. : (double)nm / (double)len); if (opt_nm) *opt_nm = nm; if (opt_n) *opt_n = len; return eslOK; ERROR: if (opt_pm) *opt_pm = 0.; if (opt_nm) *opt_nm = 0; if (opt_n) *opt_n = 0; return status; } /* Function: esl_dst_CJukesCantor() * Synopsis: Jukes-Cantor distance for two aligned strings. * Incept: SRE, Tue Apr 18 14:00:37 2006 [St. Louis] * * Purpose: Calculate the generalized Jukes-Cantor distance between * two aligned character strings and , in * substitutions/site, for an alphabet of residues * ( for nucleic acid, for proteins). The * maximum likelihood estimate for the distance is * optionally returned in . The large-sample * variance for the distance estimate is * optionally returned in . * * Alphabetic symbols <[a-zA-Z]> are compared * case-insensitively to count the number of identities * () and mismatches (>). Any nonalphabetic * character is assumed to be a gap symbol, and aligned * columns containing gap symbols are ignored. The * fractional difference used to calculate the * Jukes/Cantor distance is . * * Args: K - size of the alphabet (4 or 20) * as1 - 1st aligned seq, 0..L-1, \0-terminated * as2 - 2nd aligned seq, 0..L-1, \0-terminated * opt_distance - optRETURN: ML estimate of distance d * opt_variance - optRETURN: large-sample variance of d * * Returns: on success. * * Infinite distances are possible, in which case distance * and variance are both . Caller has to deal * with this case as it sees fit, perhaps by enforcing * an arbitrary maximum distance. * * Throws: if the two strings aren't the same length (and * thus can't have been properly aligned). * if no aligned residues were counted. * On either failure, distance and variance are both returned * as . */ int esl_dst_CJukesCantor(int K, const char *as1, const char *as2, double *opt_distance, double *opt_variance) { int n1, n2; /* number of observed identities, substitutions */ int i; /* position in aligned seqs */ int status; n1 = n2 = 0; for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++) { if (isalpha(as1[i]) && isalpha(as2[i])) { if (toupper(as1[i]) == toupper(as2[i])) n1++; else n2++; } } if (as1[i] != '\0' || as2[i] != '\0') ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); return jukescantor(n1, n2, K, opt_distance, opt_variance); /* can throw eslEDIVZERO */ ERROR: if (opt_distance != NULL) *opt_distance = HUGE_VAL; if (opt_variance != NULL) *opt_variance = HUGE_VAL; return status; } /*------- end, pairwise distances for aligned text seqs ---------*/ /***************************************************************** * 2. Pairwise distances for aligned digitized sequences. *****************************************************************/ /* Function: esl_dst_XPairId() * Synopsis: Pairwise identity of two aligned digital seqs. * Incept: SRE, Tue Apr 18 09:24:05 2006 [St. Louis] * * Purpose: Digital version of : and * are digitized aligned sequences, in alphabet * . * * Only exactly matching codes count as identities; * canonical residues, of course, but also IUPAC degeneracy * codes. (YY is an identity; YC is not.) It would be more * sophisticated to use to handle * degeneracies, but doing that would require that * be changed to an expectation, and a double. * * Args: abc - digital alphabet in use * ax1 - aligned digital seq 1 * ax2 - aligned digital seq 2 * opt_pid - optRETURN: pairwise identity, 0<=x<=1 * opt_nid - optRETURN: # of identities * opt_n - optRETURN: denominator MIN(len1,len2) * * Returns: on success. , , * contain the answers, for any of these that were passed * non- pointers. * * Throws: if the strings are different lengths (not aligned). */ int esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, double *opt_pid, int *opt_nid, int *opt_n) { int nid; /* total identical positions */ int len1, len2; /* lengths of seqs */ int i; /* position in aligned seqs */ int status; nid = len1 = len2 = 0; for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsResidue(abc, ax1[i])) len1++; if (esl_abc_XIsResidue(abc, ax2[i])) len2++; if (esl_abc_XIsResidue(abc, ax1[i]) && esl_abc_XIsResidue(abc, ax2[i]) && ax1[i] == ax2[i]) nid++; // IUPAC degen only counts as identity if code exactly matches } len1 = ESL_MIN(len1, len2); if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_pid) *opt_pid = ( len1==0 ? 0. : (double) nid / (double) len1 ); if (opt_nid) *opt_nid = nid; if (opt_n) *opt_n = len1; return eslOK; ERROR: if (opt_pid) *opt_pid = 0.; if (opt_nid) *opt_nid = 0; if (opt_n) *opt_n = 0; return status; } /* Function: esl_dst_XPairMatch() * Synopsis: Pairwise matches of two aligned digital seqs. * Incept: ER, Wed Oct 29 09:09:07 EDT 2014 [janelia] * * Purpose: Digital version of : and * are digitized aligned sequences, in alphabet * . * * IUPAC degeneracy codes count as residues both in * counting match (XX residue/residue) and delete/insert * (X-, -X) states. * * Args: abc - digital alphabet in use * ax1 - aligned digital seq 1 * ax2 - aligned digital seq 2 * opt_pm - optRETURN: pairwise fractional match states, M/(M+D+I), 0<=x<=1 * opt_nm - optRETURN: # of match states, M * opt_n - optRETURN: denominator alen-double_gaps, (M+D+I) * * Returns: on success. , , * contain the answers, for any of these that were passed * non- pointers. * * Throws: if the strings are different lengths (not aligned). */ int esl_dst_XPairMatch(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, double *opt_pm, int *opt_nm, int *opt_n) { int nm; // total matched positions int len; // length of alignment (no double gaps) int i; // position in aligned seqs int status; nm = len = 0; for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsResidue(abc, ax1[i]) || esl_abc_XIsResidue(abc, ax2[i])) len++; if (esl_abc_XIsResidue(abc, ax1[i]) && esl_abc_XIsResidue(abc, ax2[i])) nm++; } if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_pm) *opt_pm = ( len==0 ? 0. : (double)nm / (double)len ); if (opt_nm) *opt_nm = nm; if (opt_n) *opt_n = len; return eslOK; ERROR: if (opt_pm) *opt_pm = 0.; if (opt_nm) *opt_nm = 0; if (opt_n) *opt_n = 0; return status; } /* Function: esl_dst_XJukesCantor() * Synopsis: Jukes-Cantor distance for two aligned digitized seqs. * Incept: SRE, Tue Apr 18 15:26:51 2006 [St. Louis] * * Purpose: Calculate the generalized Jukes-Cantor distance between two * aligned digital strings and , in substitutions/site, * using alphabet to evaluate identities and differences. * The maximum likelihood estimate for the distance is optionally returned in * . The large-sample variance for the distance * estimate is optionally returned in . * * Identical to , except that it takes * digital sequences instead of character strings. * * Args: abc - bioalphabet to use for comparisons * ax1 - 1st digital aligned seq * ax2 - 2nd digital aligned seq * opt_distance - optRETURN: ML estimate of distance d * opt_variance - optRETURN: large-sample variance of d * * Returns: on success. As in , the * distance and variance may be infinite, in which case they * are returned as . * * Throws: if the two strings aren't the same length (and * thus can't have been properly aligned). * if no aligned residues were counted. * On either failure, the distance and variance are set * to . */ int esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, double *opt_distance, double *opt_variance) { int n1, n2; // number of observed identities, substitutions int i; // position in aligned seqs int status; n1 = n2 = 0; for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])) { if (ax1[i] == ax2[i]) n1++; else n2++; } } if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); return jukescantor(n1, n2, abc->K, opt_distance, opt_variance); ERROR: if (opt_distance) *opt_distance = HUGE_VAL; if (opt_variance) *opt_variance = HUGE_VAL; return status; } /*---------- end pairwise distances, digital seqs --------------*/ /***************************************************************** * 3. Distance matrices for aligned text sequences. *****************************************************************/ /* Function: esl_dst_CPairIdMx() * Synopsis: NxN identity matrix for N aligned text sequences. * Incept: SRE, Thu Apr 27 08:46:08 2006 [New York] * * Purpose: Given a multiple sequence alignment , consisting * of aligned character strings; calculate * a symmetric fractional pairwise identity matrix by $N(N-1)/2$ * calls to , and return it in * . * * Args: as - aligned seqs (all same length), [0..N-1] * N - # of aligned sequences * ret_S - RETURN: symmetric fractional identity matrix * * Returns: on success, and contains the fractional * identity matrix. Caller free's with * . * * Throws: if a seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_CPairIdMx(char **as, int N, ESL_DMATRIX **opt_S) { ESL_DMATRIX *S = NULL; int status; int i,j; if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < N; i++) { S->mx[i][i] = 1.; for (j = i+1; j < N; j++) { status = esl_dst_CPairId(as[i], as[j], &(S->mx[i][j]), NULL, NULL); if (status != eslOK) ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j); S->mx[j][i] = S->mx[i][j]; } } if (opt_S) *opt_S = S; else esl_dmatrix_Destroy(S); return eslOK; ERROR: esl_dmatrix_Destroy(S); if (opt_S) *opt_S = NULL; return status; } /* Function: esl_dst_CDiffMx() * Synopsis: NxN difference matrix for N aligned text sequences. * Incept: SRE, Fri Apr 28 06:27:20 2006 [New York] * * Purpose: Same as , but calculates * the fractional difference instead of the * fractional identity for each pair. * * Args: as - aligned seqs (all same length), [0..N-1] * N - # of aligned sequences * opt_D - RETURN: symmetric fractional difference matrix * * Returns: on success, and contains the * fractional difference matrix. Caller free's with * . * * Throws: if any seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. */ int esl_dst_CDiffMx(char **as, int N, ESL_DMATRIX **opt_D) { ESL_DMATRIX *D = NULL; int status; int i,j; if ((status = esl_dst_CPairIdMx(as, N, &D)) != eslOK) goto ERROR; for (i = 0; i < N; i++) { D->mx[i][i] = 0.; for (j = i+1; j < N; j++) { D->mx[i][j] = 1. - D->mx[i][j]; D->mx[j][i] = D->mx[i][j]; } } if (opt_D) *opt_D = D; else esl_dmatrix_Destroy(D); return eslOK; ERROR: esl_dmatrix_Destroy(D); if (opt_D) *opt_D = NULL; return status; } /* Function: esl_dst_CJukesCantorMx() * Synopsis: NxN Jukes/Cantor distance matrix for N aligned text seqs. * Incept: SRE, Tue Apr 18 16:00:16 2006 [St. Louis] * * Purpose: Given a multiple sequence alignment , consisting of * aligned character sequences in an alphabet of * letters (usually 4 for DNA, 20 for protein); * calculate a symmetric Jukes/Cantor pairwise distance * matrix for all sequence pairs, and optionally return the distance * matrix in , and optionally return a symmetric matrix of the * large-sample variances for those ML distance estimates * in . * * Infinite distances (and variances) are possible; they * are represented as in and . Caller must * be prepared to deal with them as appropriate. * * Args: K - size of the alphabet (usually 4 or 20) * as - aligned sequences [0.nseq-1][0..L-1] * N - number of aseqs * opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx * opt_V - optRETURN: matrix of variances. * * Returns: on success. and contain the * distance matrix (and variances); caller frees these with * . * * Throws: if any pair of sequences have differing lengths * (and thus cannot have been properly aligned). * if some pair of sequences had no aligned * residues. On failure, and are both returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_CJukesCantorMx(int K, char **as, int N, ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V) { ESL_DMATRIX *D = NULL; ESL_DMATRIX *V = NULL; int i,j; int status; if (( D = esl_dmatrix_Create(N, N) ) == NULL) { status = eslEMEM; goto ERROR; } if (( V = esl_dmatrix_Create(N, N) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < N; i++) { D->mx[i][i] = 0.; V->mx[i][i] = 0.; for (j = i+1; j < N; j++) { if ((status = esl_dst_CJukesCantor(K, as[i], as[j], &(D->mx[i][j]), &(V->mx[i][j]))) != eslOK) ESL_XEXCEPTION(status, "J/C calculation failed at seqs %d,%d", i,j); D->mx[j][i] = D->mx[i][j]; V->mx[j][i] = V->mx[i][j]; } } if (opt_D) *opt_D = D; else esl_dmatrix_Destroy(D); if (opt_V) *opt_V = V; else esl_dmatrix_Destroy(V); return eslOK; ERROR: esl_dmatrix_Destroy(D); esl_dmatrix_Destroy(V); if (opt_D) *opt_D = NULL; if (opt_V) *opt_V = NULL; return status; } /*----------- end, distance matrices for aligned text seqs ---------*/ /***************************************************************** * 4. Distance matrices for aligned digital sequences. *****************************************************************/ /* Function: esl_dst_XPairIdMx() * Synopsis: NxN identity matrix for N aligned digital seqs. * Incept: SRE, Thu Apr 27 09:08:11 2006 [New York] * * Purpose: Given a digitized multiple sequence alignment , consisting * of aligned digital sequences in alphabet ; calculate * a symmetric pairwise fractional identity matrix by $N(N-1)/2$ * calls to , and return it in . * * Args: abc - digital alphabet in use * ax - aligned dsq's, [0..N-1][1..alen] * N - number of aligned sequences * opt_S - RETURN: NxN matrix of fractional identities * * Returns: on success, and contains the distance * matrix. Caller is obligated to free with * . * * Throws: if a seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_XPairIdMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **opt_S) { ESL_DMATRIX *S = NULL; int i,j; int status; if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < N; i++) { S->mx[i][i] = 1.; for (j = i+1; j < N; j++) { status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL); if (status != eslOK) ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j); S->mx[j][i] = S->mx[i][j]; } } if (opt_S) *opt_S = S; else esl_dmatrix_Destroy(S); return eslOK; ERROR: esl_dmatrix_Destroy(S); if (opt_S) *opt_S = NULL; return status; } /* Function: esl_dst_XDiffMx() * Synopsis: NxN difference matrix for N aligned digital seqs. * Incept: SRE, Fri Apr 28 06:37:29 2006 [New York] * * Purpose: Same as , but calculates fractional * difference <1-s> instead of fractional identity for * each pair. * * Args: abc - digital alphabet in use * ax - aligned dsq's, [0..N-1][1..alen] * N - number of aligned sequences * opt_D - RETURN: NxN matrix of fractional differences * * Returns: on success, and contains the difference * matrix; caller is obligated to free with * . * * Throws: if a seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. */ int esl_dst_XDiffMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **opt_D) { ESL_DMATRIX *D = NULL; int i,j; int status; if ((status = esl_dst_XPairIdMx(abc, ax, N, &D)) != eslOK) goto ERROR; for (i = 0; i < N; i++) { D->mx[i][i] = 0.; for (j = i+1; j < N; j++) { D->mx[i][j] = 1. - D->mx[i][j]; D->mx[j][i] = D->mx[i][j]; } } if (opt_D) *opt_D = D; else esl_dmatrix_Destroy(D); return eslOK; ERROR: esl_dmatrix_Destroy(D); if (opt_D) *opt_D = NULL; return status; } /* Function: esl_dst_XJukesCantorMx() * Synopsis: NxN Jukes/Cantor distance matrix for N aligned digital seqs. * Incept: SRE, Thu Apr 27 08:38:08 2006 [New York City] * * Purpose: Given a digitized multiple sequence alignment , * consisting of aligned digital sequences in * bioalphabet , calculate a symmetric Jukes/Cantor * pairwise distance matrix for all sequence pairs; * optionally return the distance matrix in and * a matrix of the large-sample variances for those ML distance * estimates in . * * Infinite distances (and variances) are possible. They * are represented as in and . Caller must * be prepared to deal with them as appropriate. * * Args: abc - bioalphabet for * ax - aligned digital sequences [0.nseq-1][1..L] * N - number of aseqs * opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx * opt_V - optRETURN: matrix of variances. * * Returns: on success. and , if provided, contain the * distance matrix and variances. Caller frees these with * . * * Throws: if any pair of sequences have differing lengths * (and thus cannot have been properly aligned). * if some pair of sequences had no aligned * residues. On failure, and are both returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V) { ESL_DMATRIX *D = NULL; ESL_DMATRIX *V = NULL; int status; int i,j; if (( D = esl_dmatrix_Create(N, N) ) == NULL) { status = eslEMEM; goto ERROR; } if (( V = esl_dmatrix_Create(N, N) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < N; i++) { D->mx[i][i] = 0.; V->mx[i][i] = 0.; for (j = i+1; j < N; j++) { if ((status = esl_dst_XJukesCantor(abc, ax[i], ax[j], &(D->mx[i][j]), &(V->mx[i][j]))) != eslOK) ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j); D->mx[j][i] = D->mx[i][j]; V->mx[j][i] = V->mx[i][j]; } } if (opt_D) *opt_D = D; else esl_dmatrix_Destroy(D); if (opt_V) *opt_V = V; else esl_dmatrix_Destroy(V); return eslOK; ERROR: esl_dmatrix_Destroy(D); esl_dmatrix_Destroy(V); if (opt_D) *opt_D = NULL; if (opt_V) *opt_V = NULL; return status; } /*------- end, distance matrices for digital alignments ---------*/ /***************************************************************** * 5. Average pairwise identity for multiple alignments *****************************************************************/ /* Function: esl_dst_CAverageId() * Synopsis: Calculate avg identity for multiple alignment * Incept: SRE, Fri May 18 15:02:38 2007 [Janelia] * * Purpose: Calculates the average pairwise fractional identity in * a multiple sequence alignment , consisting of * aligned character sequences of identical length. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise identity (range $[0..$ pid $..1]$ * is calculated using . * * Returns: on success, and <*opt_avgid> contains the average * fractional identity. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*opt_avgid> is set to 0. */ int esl_dst_CAverageId(char **as, int N, int max_comparisons, double *opt_avgid) { ESL_RANDOMNESS *rng = NULL; double avgid = 0.; double id; int i,j,k; int status; if (N <= 1) // by convention, with no pairwise comparisons for N=0|1, set pid = 1.0 { avgid = 1.0; } else if (N <= max_comparisons && // if N is small enough that we can average exhaustively over all pairwise comparisons... N <= sqrt(2. * max_comparisons) && // (beware numerical overflow of N^2) (N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; avgid += id; } avgid /= (double) (N * (N-1) / 2); } else // If nseq is large, calculate average over a stochastic sample. { if (( rng = esl_randomness_Create(42) ) == NULL) { status = eslEMEM; goto ERROR; } // fixed seed, suppress stochastic variation for (k = 0; k < max_comparisons; k++) { do { i = esl_rnd_Roll(rng, N); j = esl_rnd_Roll(rng, N); } while (j == i); // make sure j != i if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; avgid += id; } avgid /= (double) max_comparisons; } esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = avgid; return eslOK; ERROR: esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = 0.; return status; } /* Function: esl_dst_CAverageMatch() * Synopsis: Calculate avg matches for multiple alignment * Incept: ER, Wed Oct 29 09:25:09 EDT 2014 [Janelia] * * Purpose: Calculates the average pairwise fractional matches M/(M+D+I) in * a multiple sequence alignment , consisting of * aligned character sequences of identical length. M,D,I * are in the pair-HMM state sense: M means any aligned pair, * including both mismatches and identities. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise matches (range $[0..$ pm $..1]$ * is calculated using . * * Returns: on success, and <*ret_avgpm> contains the average * fractional matches. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_avgpm> is set to 0. */ int esl_dst_CAverageMatch(char **as, int N, int max_comparisons, double *opt_avgpm) { ESL_RANDOMNESS *rng = NULL; double avgpm = 0.; double pmatch; int i,j,k; int status; if (N <= 1) // Edge case: a single sequence by itself has no pairwise comparisons { // Set a convention of id = 1.0 for the case of no pairwise comparisons avgpm = 1.0; } else if (N <= max_comparisons && // Is N small enough that we can average over all pairwise comparisons? N <= sqrt(2. * max_comparisons) && // Watch out for numerical overflow in this: for large MSAs, N(N-1)/2 can overflow. (N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_CPairMatch(as[i], as[j], &pmatch, NULL, NULL)) != eslOK) return status; avgpm += pmatch; } avgpm /= (double) (N * (N-1) / 2); } else /* If nseq is large, calculate average over a stochastic sample. */ { if (( rng = esl_randomness_Create(42) ) == NULL) { status = eslEMEM; goto ERROR; } // fixed seed, suppress stochastic variation for (k = 0; k < max_comparisons; k++) { do { i = esl_rnd_Roll(rng, N); j = esl_rnd_Roll(rng, N); } while (j == i); // make sure j != i if ((status = esl_dst_CPairMatch(as[i], as[j], &pmatch, NULL, NULL)) != eslOK) return status; avgpm += pmatch; } avgpm /= (double) max_comparisons; } esl_randomness_Destroy(rng); if (opt_avgpm) *opt_avgpm = avgpm; return eslOK; ERROR: esl_randomness_Destroy(rng); if (opt_avgpm) *opt_avgpm = 0.; return status; } /* Function: esl_dst_XAverageId() * Synopsis: Calculate avg identity for digital MSA * Incept: SRE, Fri May 18 15:19:14 2007 [Janelia] * * Purpose: Calculates the average pairwise fractional identity in * a digital multiple sequence alignment , consisting of * aligned digital sequences of identical length. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise identity (range $[0..$ pid $..1]$ * is calculated using . * * Returns: on success, and <*ret_id> contains the average * fractional identity. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_id> is set to 0. */ int esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *opt_avgid) { ESL_RANDOMNESS *rng = NULL; double avgid = 0.; double id; int i,j,k; int status; if (N <= 1) // Edge case: a single sequence by itself has no pairwise comparisons { // Set a convention of id = 1.0 for the case of no pairwise comparisons avgid = 1.; } else if (N <= max_comparisons && // Is N small enough that we can average over all pairwise comparisons? N <= sqrt(2. * max_comparisons) && // Watch out for numerical overflow in this: for large MSAs, N(N-1)/2 can overflow. (N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status; avgid += id; } avgid /= (double) (N * (N-1) / 2); } else /* If nseq is large, calculate average over a stochastic sample. */ { if (( rng = esl_randomness_Create(42) ) == NULL) { status = eslEMEM; goto ERROR; } // fixed seed, suppress stochastic variation for (k = 0; k < max_comparisons; k++) { do { i = esl_rnd_Roll(rng, N); j = esl_rnd_Roll(rng, N); } while (j == i); // make sure j != i if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status; avgid += id; } avgid /= (double) max_comparisons; } if (rng) esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = avgid; return eslOK; ERROR: if (rng) esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = 0.; return status; } /* Function: esl_dst_XAverageMatch() * Synopsis: Calculate avg matches for digital MSA * Incept: ER, ed Oct 29 09:29:05 EDT 2014 [Janelia] * * Purpose: Calculates the average pairwise fractional matches in * a digital multiple sequence alignment , consisting of * aligned digital sequences of identical length. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise matches (range $[0..$ pm $..1]$ * is calculated using . * * Returns: on success, and <*opt_avgpm> contains the average * fractional identity. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_avgpm> is set to 0. */ int esl_dst_XAverageMatch(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *opt_avgpm) { ESL_RANDOMNESS *rng = NULL; double avgpm = 0.; double pm; int i,j,k; int status; if (N <= 1) // Edge case: a single sequence by itself has no pairwise comparisons { // Set a convention of pm = 1.0 for the case of no pairwise comparisons avgpm = 1.; } else if (N <= max_comparisons && // Is N small enough that we can average over all pairwise comparisons? N <= sqrt(2. * max_comparisons) && // Watch out for numerical overflow in this: for large MSAs, N(N-1)/2 can overflow. (N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &pm, NULL, NULL)) != eslOK) return status; avgpm += pm; } avgpm /= (double) (N * (N-1) / 2); } else /* If nseq is large, calculate average over a stochastic sample. */ { if (( rng = esl_randomness_Create(42) ) == NULL) { status = eslEMEM; goto ERROR; } // fixed seed, suppress stochastic variation for (k = 0; k < max_comparisons; k++) { do { i = esl_rnd_Roll(rng, N); j = esl_rnd_Roll(rng, N); } while (j == i); // make sure j != i if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &pm, NULL, NULL)) != eslOK) return status; avgpm += pm; } avgpm /= (double) max_comparisons; } if (rng) esl_randomness_Destroy(rng); if (opt_avgpm) *opt_avgpm = avgpm; return eslOK; ERROR: if (rng) esl_randomness_Destroy(rng); if (opt_avgpm) *opt_avgpm = 0.; return status; } /* Function: esl_dst_XAvgConnectivity() * Synopsis: Calculate both avg identity and connectivity for digital MSA * Incept: SRE, Sat 07 May 2022 * * Purpose: Calculates average connectivity: the fraction of sequence * pairs with fractional pairwise identity > * . Also calculates average fractional pairwise * identity, since it's efficient to calculate avg id and * connectivity at the same time. The input is a digital * multiple sequence alignment of sequences in * alphabet . * * As in , if an exhaustive calculation * would require more than pairwise * comparisons, then instead of looking at all pairs, * calculate the average over a stochastic sample of * random pairs. This allows the routine * to work efficiently even on very deep MSAs. * * Each fractional pairwise identity (range $[0..$ pid $..1]$ * is calculated using . The calculation * is # of identities / unaligned length of shorter seq. * * Only makes sense for N > 1, so that there are pairwise * comparisons to make; but we may encounter N = 1 or N = 0 * as edge cases of single sequence or empty alignments. * In this case, avgid and avgconn are set to 1.0 by * convention. * * Args: abc - digital alphabet * ax - array of digital aligned seqs * N - number of seqs in * max_comparisons - switch to a stochastic sample if N(N-1)/2 > this * idthresh - count pairs as linked when they have identity > this threshold [0..1] * opt_avgid - optRETURN: average identity [0..1] * opt_avgconn - optRETURN: average connectivity [0..1] * * Returns: on success, and <*opt_avgid> and <*opt_avgconn>, if * provided, contain avgid and avgconn. * * Throws: on allocation failure; * if any aligned seq pair aren't of same length. * <*opt_avgid> and <*opt_avgconn> (if provided) are set to 0. * * Xref: Adapted from a previous version, esl_dst_Connectivity(), from * Sam Petti in Sept 2020. Where we calculate connectivity * (e.g. in profmark), we also need to calc avg identity, * and it's more efficient to calculate both at once. */ int esl_dst_XAvgConnectivity(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double idthresh, double *opt_avgid, double *opt_avgconn) { ESL_RANDOMNESS *rng = NULL; double avgid = 0.; double avgconn = 0.; double id; int i,j,k; int status; if (N <= 1) // Edge case: a single sequence by itself has no pairwise comparisons { // Set a convention of id = 1.0 and conn = 1.0 for the case of no pairwise comparisons avgid = avgconn = 1.; } else if (N <= max_comparisons && // Is N small enough that we can average over all pairwise comparisons? N <= sqrt(2. * max_comparisons) && // Watch out for numerical overflow in this: for large MSAs, N(N-1)/2 can overflow. (N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) goto ERROR; if (id > idthresh) avgconn += 1.; avgid += id; } avgid /= (double) (N * (N-1) / 2); avgconn /= (double) (N * (N-1) / 2); } else // for large N, calculate averages over a reproducible stochastic sample. { if (( rng = esl_randomness_Create(42) ) == NULL) { status = eslEMEM; goto ERROR; } // fixed seed, suppress stochastic variation for (k = 0; k < max_comparisons; k++) { do { i = esl_rnd_Roll(rng, N); j = esl_rnd_Roll(rng, N); } while (j == i); /* make sure j != i */ if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) goto ERROR; if (id > idthresh) avgconn += 1.; avgid += id; } avgid /= (double) max_comparisons; avgconn /= (double) max_comparisons; } if (rng) esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = avgid; if (opt_avgconn) *opt_avgconn = avgconn; return eslOK; ERROR: if (rng) esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = 0.; if (opt_avgconn) *opt_avgconn = 0.; return status; } /* Function: esl_dst_XAvgSubsetConnectivity() * Synopsis: Like esl_dst_XAvgSubsetConnectivity(), but for a subset of the MSA * Incept: SRE, Thu 12 May 2022 * * Purpose: Calculate average identity and average connectivity as in * , for a subset of the sequences * in the input . The subset is defined by an array of * sequence indices : . * * (This was written for HMMER's create-profmark, which * does a bunch of manipulation of MSA subsets, and for * efficiency reasons, it doesn't explicitly extract new * MSAs for them.) * * Args: abc - digital alphabet * ax - array of digital aligned seqs * N - number of seqs in * V - array of seq indices in the subset; V[0..nV-1] = 0..nseq-1 * nV - number of seqs in the subset; nV <= N * max_comparisons - switch to a stochastic sample if N(N-1)/2 > this * idthresh - count pairs as linked when they have identity > this threshold [0..1] * opt_avgid - optRETURN: average identity [0..1] * opt_avgconn - optRETURN: average connectivity [0..1] * * Returns: on success, and <*opt_avgid> and <*opt_avgconn>, if * provided, contain avgid and avgconn. * * Throws: on allocation failure; * if any aligned seq pair aren't of same length. * <*opt_avgid> and <*opt_avgconn> (if provided) are set to 0. */ int esl_dst_XAvgSubsetConnectivity(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int *V, int nV, int max_comparisons, double idthresh, double *opt_avgid, double *opt_avgconn) { ESL_RANDOMNESS *rng = NULL; double avgid = 0.; double avgconn = 0.; double id; int i,j,k; int status; ESL_DASSERT1(( nV <= N )); // indices in V[0..nV-1] are a subset of 0..N-1 if (nV <= 1) // Edge case: a single sequence by itself has no pairwise comparisons { // Set a convention of id = 1.0 and conn = 1.0 for the case of no pairwise comparisons avgid = avgconn = 1.; } else if (nV <= max_comparisons && // Is nV small enough that we can average over all pairwise comparisons? nV <= sqrt(2. * max_comparisons) && // Watch out for numerical overflow in this: for large MSAs, nV(nV-1)/2 can overflow. (nV * (nV-1) / 2) <= max_comparisons) { for (i = 0; i < nV; i++) for (j = i+1; j < nV; j++) { ESL_DASSERT1(( V[i] >= 0 && V[i] <= N && V[j] >= 0 && V[j] <= N )); if ((status = esl_dst_XPairId(abc, ax[V[i]], ax[V[j]], &id, NULL, NULL)) != eslOK) goto ERROR; if (id > idthresh) avgconn += 1.; avgid += id; } avgid /= (double) (nV * (nV-1) / 2); avgconn /= (double) (nV * (nV-1) / 2); } else // for large nV, calculate averages over a reproducible stochastic sample. { if (( rng = esl_randomness_Create(42) ) == NULL) { status = eslEMEM; goto ERROR; } // fixed seed, suppress stochastic variation for (k = 0; k < max_comparisons; k++) { do { i = esl_rnd_Roll(rng, nV); j = esl_rnd_Roll(rng, nV); } while (j == i); /* make sure j != i */ ESL_DASSERT1(( V[i] >= 0 && V[i] <= N && V[j] >= 0 && V[j] <= N )); if ((status = esl_dst_XPairId(abc, ax[V[i]], ax[V[j]], &id, NULL, NULL)) != eslOK) goto ERROR; if (id > idthresh) avgconn += 1.; avgid += id; } avgid /= (double) max_comparisons; avgconn /= (double) max_comparisons; } if (rng) esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = avgid; if (opt_avgconn) *opt_avgconn = avgconn; return eslOK; ERROR: if (rng) esl_randomness_Destroy(rng); if (opt_avgid) *opt_avgid = 0.; if (opt_avgconn) *opt_avgconn = 0.; return status; } /***************************************************************** * 6. Private (static) functions *****************************************************************/ /* jukescantor() * * The generalized Jukes/Cantor distance calculation. * Given identities and differences, for a * base alphabet size of (4 or 20); * calculate J/C distance in substitutions/site and * return it in ; calculate large-sample * variance and return it in . * * Returns if there are no data (). */ static int jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance) { int status; double D, K, N; double x; double distance, variance; ESL_DASSERT1( (n1 >= 0) ); ESL_DASSERT1( (n2 >= 0) ); ESL_DASSERT1( (alphabet_size >= 0) ); if (n1+n2 == 0) { status = eslEDIVZERO; goto ERROR; } K = (double) alphabet_size; D = (double) n2 / (double) (n1+n2); N = (double) (n1+n2); x = 1. - D * K/(K-1.); if (x <= 0.) { distance = HUGE_VAL; variance = HUGE_VAL; } else { distance = -log(x) * K/(K-1); variance = exp( 2.*K*distance/(K-1) ) * D * (1.-D) / N; } if (opt_distance != NULL) *opt_distance = distance; if (opt_variance != NULL) *opt_variance = variance; return eslOK; ERROR: if (opt_distance != NULL) *opt_distance = HUGE_VAL; if (opt_variance != NULL) *opt_variance = HUGE_VAL; return status; } /*--------------- end of private functions ----------------------*/ /***************************************************************** * 7. Unit tests. *****************************************************************/ #ifdef eslDISTANCE_TESTDRIVE #include "esl_msa.h" #include "esl_msafile.h" #include "esl_vectorops.h" /* Each unit test has access to the same small test alignment * and predetermined pairwise identities and # of matches. * * At least one comparison should be 0% identity, to exercise one of * the cases in jukes-cantor distance calculations. Here, seq0/seq2. * * The alignment contains only canonical residues, because one of our * tests is that C and X functions give the same results. * * You can change the alignment and the associated static data here. * The unit tests don't make any assumptions of their own about the * example alignment. */ static char testmsa[] = "\ # STOCKHOLM 1.0\n\ \n\ seq0 .AAAA.AAAA\n\ seq1 ...AA.AC..\n\ seq2 .CCCC.CCCC\n\ seq3 ....A.AAC.\n\ //"; static double true_pid[4][4] = { { 1.0, 0.75, 0.0, 0.75 }, { 0.75, 1.0, 0.25, 0.5 }, { 0.0, 0.25, 1.0, 0.25 }, { 0.75, 0.5, 0.25, 1.0 } }; static int true_nid[4][4] = { { 8, 3, 0, 3 }, { 3, 4, 1, 2 }, { 0, 1, 8, 1 }, { 3, 2, 1, 4 } }; static int true_len[4] = { 8, 4, 8, 4 }; static int true_M[4][4] = { { 8, 4, 8, 4 }, { 4, 4, 4, 3 }, { 8, 4, 8, 4 }, { 4, 3, 4, 4 } }; static int true_MDI[4][4] = { { 8, 8, 8, 8 }, { 8, 4, 8, 5 }, { 8, 8, 8, 8 }, { 8, 5, 8, 4 } }; static void create_testmsa(ESL_ALPHABET **ret_abc, ESL_MSA **ret_cmsa, ESL_MSA **ret_xmsa) { ESL_ALPHABET *abc = esl_alphabet_Create(eslDNA); ESL_MSA *cmsa = esl_msa_CreateFromString(testmsa, eslMSAFILE_STOCKHOLM); ESL_MSA *xmsa = esl_msa_Clone(cmsa); char errmsg[eslERRBUFSIZE]; if ( esl_msa_Digitize(abc, xmsa, errmsg) != eslOK) esl_fatal("esl_distance create_testmsa failed:\n%s\n", errmsg); *ret_abc = abc; *ret_cmsa = cmsa; *ret_xmsa = xmsa; } static void utest_pairs(const ESL_ALPHABET *abc, ESL_DSQ **ax, char **as, int N) { char msg[] = "esl_distance::utest_pairs failed"; double pid_c, pid_x, mpid_c, mpid_x; int nid_x, nid_c, mnid_c, mnid_x; int nres_x, nres_c, mn_c, mn_x; double jcd_x, jcd_c; double jcv_x, jcv_c; int i,j; for (i = 0; i < N; i++) for (j = 0; j < N; j++) { if ( esl_dst_CPairId ( as[i], as[j], &pid_c, &nid_c, &nres_c) != eslOK) esl_fatal(msg); if ( esl_dst_CPairMatch ( as[i], as[j], &mpid_c, &mnid_c, &mn_c) != eslOK) esl_fatal(msg); if ( esl_dst_CJukesCantor(abc->K, as[i], as[j], &jcd_c, &jcv_c) != eslOK) esl_fatal(msg); if ( esl_dst_XPairId (abc, ax[i], ax[j], &pid_x, &nid_x, &nres_x) != eslOK) esl_fatal(msg); if ( esl_dst_XPairMatch (abc, ax[i], ax[j], &mpid_x, &mnid_x, &mn_x) != eslOK) esl_fatal(msg); if ( esl_dst_XJukesCantor(abc, ax[i], ax[j], &jcd_x, &jcv_x) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_pid[i][j], pid_c, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_pid[i][j], pid_x, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( nid_c != true_nid[i][j] ) esl_fatal(msg); if ( nid_x != true_nid[i][j] ) esl_fatal(msg); if ( nres_c != ESL_MIN(true_len[i], true_len[j]) ) esl_fatal(msg); if ( nres_x != ESL_MIN(true_len[i], true_len[j]) ) esl_fatal(msg); if ( mnid_c != true_M[i][j] ) esl_fatal(msg); if ( mnid_x != true_M[i][j] ) esl_fatal(msg); if ( mn_c != true_MDI[i][j] ) esl_fatal(msg); if ( mn_x != true_MDI[i][j] ) esl_fatal(msg); if ( esl_DCompare( (double) true_M[i][j] / (double) true_MDI[i][j], mpid_c, 1e-4, 1e-4) != eslOK) esl_fatal(msg); if ( esl_DCompare( (double) true_M[i][j] / (double) true_MDI[i][j], mpid_x, 1e-4, 1e-4) != eslOK) esl_fatal(msg); if (true_pid[i][j] == 1.0 && (jcd_c != 0.0 || jcd_x != 0.0)) esl_fatal(msg); // 100% identity, including self-comparisons, give distance = 0. if (true_pid[i][j] < 1./(double)abc->K && (jcd_c != HUGE_VAL || jcd_x != HUGE_VAL)) esl_fatal(msg); // <1/K pid gives infinite distance (HUGE_VAL) } /* API accepts NULL for return values */ if (esl_dst_CPairId ( as[0], as[0], NULL, NULL, NULL) != eslOK) esl_fatal(msg); if (esl_dst_CPairMatch ( as[0], as[0], NULL, NULL, NULL) != eslOK) esl_fatal(msg); if (esl_dst_CJukesCantor(abc->K, as[0], as[0], NULL, NULL) != eslOK) esl_fatal(msg); if (esl_dst_XPairId (abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) esl_fatal(msg); if (esl_dst_XPairMatch (abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) esl_fatal(msg); if (esl_dst_XJukesCantor(abc, ax[0], ax[0], NULL, NULL) != eslOK) esl_fatal(msg); } static void utest_matrices(const ESL_ALPHABET *abc, ESL_DSQ **ax, char **as, int N) { char msg[] = "esl_distance::utest_matrices failed"; ESL_DMATRIX *id_c = NULL; ESL_DMATRIX *id_x = NULL; ESL_DMATRIX *diff_c = NULL; ESL_DMATRIX *diff_x = NULL; ESL_DMATRIX *jcd_c = NULL; ESL_DMATRIX *jcd_x = NULL; ESL_DMATRIX *jcV_c = NULL; ESL_DMATRIX *jcV_x = NULL; int i,j; if ( esl_dst_CPairIdMx ( as, N, &id_c) != eslOK) esl_fatal(msg); if ( esl_dst_CDiffMx ( as, N, &diff_c) != eslOK) esl_fatal(msg); if ( esl_dst_CJukesCantorMx(abc->K, as, N, &jcd_c, &jcV_c) != eslOK) esl_fatal(msg); if ( esl_dst_XPairIdMx (abc, ax, N, &id_x) != eslOK) esl_fatal(msg); if ( esl_dst_XDiffMx (abc, ax, N, &diff_x) != eslOK) esl_fatal(msg); if ( esl_dst_XJukesCantorMx(abc, ax, N, &jcd_x, &jcV_x) != eslOK) esl_fatal(msg); for (i = 0; i < N; i++) for (j = 0; j < N; j++) { if (esl_DCompare( true_pid[i][j], id_c->mx[i][j], 1e-4, 1e-4) != eslOK) esl_fatal(msg); if (esl_DCompare( true_pid[i][j], id_x->mx[i][j], 1e-4, 1e-4) != eslOK) esl_fatal(msg); if (esl_DCompare(1.-true_pid[i][j], diff_c->mx[i][j], 1e-4, 1e-4) != eslOK) esl_fatal(msg); if (esl_DCompare(1.-true_pid[i][j], diff_x->mx[i][j], 1e-4, 1e-4) != eslOK) esl_fatal(msg); if (esl_DCompare( jcd_c->mx[i][j], jcd_x->mx[i][j], 1e-4, 1e-4) != eslOK) esl_fatal(msg); if (esl_DCompare( jcV_c->mx[i][j], jcV_x->mx[i][j], 1e-4, 1e-4) != eslOK) esl_fatal(msg); } /* API accepts NULL for return values */ if ( esl_dst_CPairIdMx ( as, N, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_CDiffMx ( as, N, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_CJukesCantorMx(abc->K, as, N, NULL, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_XPairIdMx (abc, ax, N, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_XDiffMx (abc, ax, N, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_XJukesCantorMx(abc, ax, N, NULL, NULL) != eslOK) esl_fatal(msg); esl_dmatrix_Destroy(id_c); esl_dmatrix_Destroy(id_x); esl_dmatrix_Destroy(diff_c); esl_dmatrix_Destroy(diff_x); esl_dmatrix_Destroy(jcd_c); esl_dmatrix_Destroy(jcd_x); esl_dmatrix_Destroy(jcV_c); esl_dmatrix_Destroy(jcV_x); } static void utest_averages(const ESL_ALPHABET *abc, ESL_DSQ **ax, char **as, int N) { char msg[] = "esl_distance::utest_averages failed"; double idthresh1 = 0.70; // sort of tuned to the current example MSA: 2/6 double idthresh2 = 0.20; // ... and 5/6. But code will work with any MSA. double true_avgid = 0.; double true_avgid2 = 0.; // by Elena's denominator using the AverageMatch() functions double true_conn1 = 0.; double true_conn2 = 0.; double np = 0.; int i,j; double avgid, avgconn; for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { true_avgid += true_pid[i][j]; true_avgid2 += (double) true_M[i][j] / (double) true_MDI[i][j]; if (true_pid[i][j] > idthresh1) true_conn1 += 1.; if (true_pid[i][j] > idthresh2) true_conn2 += 1.; np += 1.; } true_avgid /= np; true_avgid2 /= np; true_conn1 /= np; true_conn2 /= np; if ( esl_dst_CAverageId(as, N, 10000, &avgid) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_avgid, avgid, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_dst_CAverageMatch(as, N, 10000, &avgid) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_avgid2, avgid, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_dst_XAverageId(abc, ax, N, 10000, &avgid) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_avgid, avgid, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_dst_XAverageMatch(abc, ax, N, 10000, &avgid) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_avgid2, avgid, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_dst_XAvgConnectivity(abc, ax, N, 10000, idthresh1, &avgid, &avgconn) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_avgid, avgid, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_conn1, avgconn, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_dst_XAvgConnectivity(abc, ax, N, 10000, idthresh2, &avgid, &avgconn) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_avgid, avgid, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); if ( esl_DCompare( true_conn2, avgconn, 1e-4, 1e-4 ) != eslOK) esl_fatal(msg); /* Edge case for zero connectivity (because links are defined as strictly > idthresh, at idthresh = 1.0 there are no links) */ if ( esl_dst_XAvgConnectivity(abc, ax, N, 10000, 1.0, NULL, &avgconn) != eslOK || avgconn != 0.0) esl_fatal(msg); /* API accepts NULL for return values */ if ( esl_dst_CAverageId ( as, N, 10000, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_CAverageMatch ( as, N, 10000, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_XAverageId (abc, ax, N, 10000, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_XAverageMatch (abc, ax, N, 10000, NULL) != eslOK) esl_fatal(msg); if ( esl_dst_XAvgConnectivity(abc, ax, N, 10000, idthresh2, NULL, NULL) != eslOK) esl_fatal(msg); } /* Test concordance between stochastic sampling estimate vs. exhaustive calculation * of avg identity, connectivity. * */ static void utest_sampling(ESL_RANDOMNESS *rng, const ESL_ALPHABET *abc, int allow_badluck) { char msg[] = "esl_distance::utest_sampling failed"; ESL_MSA *msa = NULL; int max_nseq = 400; // 1..400, so avg nseq = ~200; typically ~20K pairs; up to 79.8K int max_alen = 10; double avgid1, avgid2, avgid3, avgid4; double avgpm1, avgpm2, avgpm3, avgpm4; double avgconn1, avgconn2; if (! allow_badluck) esl_randomness_Init(rng, 4); // reinit to a fixed seed by default: less stringent test, but guarantee that it should succeed. // this seed was chosen to give nseq=320, thus exercising sampling vs. exhaustive if ( esl_msa_Sample(rng, abc, max_nseq, max_alen, &msa) != eslOK) esl_fatal(msg); if (! allow_badluck && msa->nseq != 320) esl_fatal(msg); // trap here, if someone ever changes esl_msa_Sample() we need to recheck this test if ( esl_dst_XAvgConnectivity(abc, msa->ax, msa->nseq, 10000, 0.30, &avgid1, &avgconn1) != eslOK) esl_fatal(msg); // will often (but not always) use sampling if ( esl_dst_XAvgConnectivity(abc, msa->ax, msa->nseq, 80000, 0.30, &avgid2, &avgconn2) != eslOK) esl_fatal(msg); // will always be exhaustive if ( esl_DCompare(avgid2, avgid1, 1e-4, 0.01) != eslOK) esl_fatal(msg); // absolute tolerance +/- 0.01. Default gives 0.229, 0.231 if ( esl_DCompare(avgconn2, avgconn1, 1e-4, 0.01) != eslOK) esl_fatal(msg); // ... 0.268, 0.273 if ( esl_dst_XAverageId(abc, msa->ax, msa->nseq, 10000, &avgid1) != eslOK) esl_fatal(msg); if ( esl_dst_XAverageId(abc, msa->ax, msa->nseq, 80000, &avgid2) != eslOK) esl_fatal(msg); if ( esl_DCompare(avgid2, avgid1, 1e-4, 0.01) != eslOK) esl_fatal(msg); if ( esl_dst_XAverageMatch(abc, msa->ax, msa->nseq, 10000, &avgpm1) != eslOK) esl_fatal(msg); if ( esl_dst_XAverageMatch(abc, msa->ax, msa->nseq, 80000, &avgpm2) != eslOK) esl_fatal(msg); if ( esl_DCompare(avgpm2, avgpm1, 1e-4, 0.01) != eslOK) esl_fatal(msg); // 0.825, 0.824 /* Convert MSA to text mode. We get the same avgid, avgpm in text vs digital mode of the same MSA. */ if ( esl_msa_Textize(msa) != eslOK) esl_fatal(msg); if ( esl_dst_CAverageId(msa->aseq, msa->nseq, 10000, &avgid3) != eslOK) esl_fatal(msg); if ( esl_dst_CAverageId(msa->aseq, msa->nseq, 80000, &avgid4) != eslOK) esl_fatal(msg); if ( esl_DCompare(avgid4, avgid3, 1e-4, 0.01) != eslOK) esl_fatal(msg); if ( esl_DCompare(avgid4, avgid2, 1e-4, 0.01) != eslOK) esl_fatal(msg); if ( esl_dst_CAverageMatch(msa->aseq, msa->nseq, 10000, &avgpm3) != eslOK) esl_fatal(msg); if ( esl_dst_CAverageMatch(msa->aseq, msa->nseq, 80000, &avgpm4) != eslOK) esl_fatal(msg); if ( esl_DCompare(avgpm4, avgpm3, 1e-4, 0.01) != eslOK) esl_fatal(msg); if ( esl_DCompare(avgpm4, avgpm2, 1e-4, 0.01) != eslOK) esl_fatal(msg); esl_msa_Destroy(msa); } /* Unit test for esl_dst_XAvgSubsetConnectivity() * * For a randomly sampled MSA, take a random subset, and check that we * get the same avg id and avg connectivity whether we use * esl_dst_AvgSubsetConnectivity() the efficient way, or if we extract * the subset to a new MSA and use esl_dst_AvgConnectivity(). */ static void utest_subset_connectivity(ESL_RANDOMNESS *rng, const ESL_ALPHABET *abc) { char msg[] = "esl_distance::utest_subset_connectivity failed"; int max_nseq = 100; // you don't want to set this high, but if you do, not so high that max_nseq*(max_nseq+1) overflows int max_alen = 20; ESL_MSA *msa = NULL; int *V = NULL; int *useme = NULL; ESL_MSA *submsa = NULL; int nV; double avgid1, avgid2; double avgconn1, avgconn2; int i; int status; if ( esl_msa_Sample(rng, abc, max_nseq, max_alen, &msa) != eslOK) esl_fatal(msg); // a nastily sampled MSA ESL_ALLOC(V, sizeof(int) * msa->nseq); // ... (0,1 are edge cases of no pairs, that give us avgid = avgconn = 1.0) nV = 1 + esl_rnd_Roll(rng, msa->nseq); // we'll take a subset of size nV = 1..nseq. (can't do 0; esl_msa_SequenceSubset() can't extract 0 seqs.) if ( esl_rnd_Deal(rng, nV, msa->nseq, V) != eslOK) esl_fatal(msg); // select the random subset of nV indices ESL_ALLOC(useme, sizeof(int) * msa->nseq); // useme[0..nseq-1] = 0|1 flags for included seqs esl_vec_ISet(useme, msa->nseq, 0); for (i = 0; i < nV; i++) useme[V[i]] = 1; if ( esl_msa_SequenceSubset(msa, useme, &submsa) != eslOK) esl_fatal(msg); // extract the subset MSA if ( esl_dst_XAvgConnectivity (abc, submsa->ax, submsa->nseq, max_nseq*(max_nseq+1), 0.30, &avgid1, &avgconn1) != eslOK) esl_fatal(msg); // will always be exhaustive & exact, because max_nseq*(max_nseq+1) > any subseq N(N+1)/2 if ( esl_dst_XAvgSubsetConnectivity(abc, msa->ax, msa->nseq, V, nV, max_nseq*(max_nseq+1), 0.30, &avgid2, &avgconn2) != eslOK) esl_fatal(msg); // thus no stochastic variation; avgid and avgconn are expected to match exactly (up to fp error) if ( esl_DCompare(avgid1, avgid2, 1e-4, 1e-4) != eslOK) esl_fatal(msg); if ( esl_DCompare(avgconn1, avgconn2, 1e-4, 1e-4) != eslOK) esl_fatal(msg); esl_msa_Destroy(submsa); esl_msa_Destroy(msa); free(useme); free(V); return; ERROR: esl_fatal(msg); } #endif /* eslDISTANCE_TESTDRIVE */ /*------------------ end of unit tests --------------------------*/ /***************************************************************** * 8. Test driver. *****************************************************************/ #ifdef eslDISTANCE_TESTDRIVE #include "easel.h" #include "esl_alphabet.h" #include "esl_getopts.h" #include "esl_msa.h" #include "esl_random.h" #include "esl_distance.h" static ESL_OPTIONS options[] = { /* name type def env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0 }, { "-s", eslARG_INT, "0", NULL,"n>=0",NULL, NULL, NULL, "random number seed set to ", 0 }, { "-x", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "allow bad luck (rare stochastic failures)", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options]"; static char banner[] = "test driver for distance module"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); int allow_badluck = esl_opt_GetBoolean(go, "-x"); ESL_ALPHABET *abc = NULL; ESL_MSA *cmsa = NULL; // text mode ESL_MSA *xmsa = NULL; // digital mode fprintf(stderr, "## %s\n", argv[0]); fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng)); create_testmsa(&abc, &cmsa, &xmsa); utest_pairs (abc, xmsa->ax, cmsa->aseq, xmsa->nseq); utest_matrices(abc, xmsa->ax, cmsa->aseq, xmsa->nseq); utest_averages(abc, xmsa->ax, cmsa->aseq, xmsa->nseq); utest_subset_connectivity(rng, abc); /* tests that can stochastically fail must go last; by default they will reinit the RNG w/ fixed seed */ utest_sampling(rng, abc, allow_badluck); fprintf(stderr, "# status = ok\n"); esl_msa_Destroy(cmsa); esl_msa_Destroy(xmsa); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); return eslOK; } #endif /*eslDISTANCE_TESTDRIVE*/ /***************************************************************** * 9. Example. *****************************************************************/ #ifdef eslDISTANCE_EXAMPLE /*::cexcerpt::distance_example::begin::*/ #include "easel.h" #include "esl_msa.h" #include "esl_msafile.h" #include "esl_distance.h" int main(int argc, char **argv) { char *msafile = argv[1]; ESL_ALPHABET *abc = NULL; ESL_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int max_comparisons = 10000; double idthresh = 0.50; double avgid; double avgconn; int status; if ((status = esl_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status); if ((status = esl_msafile_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status); esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid); printf("esl_dst_XAvgId()...\n"); printf("average %%id: %.1f%%\n\n", avgid*100.); esl_dst_XAvgConnectivity(abc, msa->ax, msa->nseq, max_comparisons, idthresh, &avgid, &avgconn); printf("esl_dst_XAvgConnectivity()...\n"); printf("average %%id: %.1f%%\n", avgid*100.); printf("average %%conn at %.1f%%id: %.1f%%\n", idthresh*100, avgconn*100.); esl_msa_Destroy(msa); esl_msafile_Close(afp); esl_alphabet_Destroy(abc); return eslOK; } /*::cexcerpt::distance_example::end::*/ #endif /*eslDISTANCE_EXAMPLE*/