/* Pairwise identities, distances, and distance matrices. * * Contents: * 1. Pairwise distances for aligned text sequences. * 2. Pairwise distances for aligned digital seqs. * 3. Distance matrices for aligned text sequences. * 4. Distance matrices for aligned digital sequences. * 5. Average pairwise identity for multiple alignments. * 6. Private (static) functions. * 7. Unit tests. * 8. Test driver. * 9. Example. */ #include "esl_config.h" #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_dmatrix.h" #include "esl_random.h" #include "esl_distance.h" /* Forward declaration of our static functions. */ static int jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance); /***************************************************************** * 1. Pairwise distances for aligned text sequences. *****************************************************************/ /* Function: esl_dst_CPairId() * Synopsis: Pairwise identity of two aligned text strings. * Incept: SRE, Mon Apr 17 20:06:07 2006 [St. Louis] * * Purpose: Calculates pairwise fractional identity between two * aligned character strings and . * Return this distance in ; return the * number of identities counted in ; and * return the denominator in * . * * Alphabetic symbols <[a-zA-Z]> are compared * case-insensitively for identity. Any nonalphabetic * character is assumed to be a gap symbol. * * This simple comparison rule is unaware of synonyms and * degeneracies in biological alphabets. For a more * sophisticated and biosequence-aware comparison, use * digitized sequences and the function * instead. Note that currently does * not correctly handle degeneracies, but is set up to. * * Args: asq1 - aligned character string 1 * asq2 - aligned character string 2 * opt_pid - optRETURN: pairwise identity, 0<=x<=1 * opt_nid - optRETURN: # of identities * opt_n - optRETURN: denominator MIN(len1,len2) * * Returns: on success. , , * contain the answers (for whichever were passed non-NULL). * * Throws: if the strings are different lengths * (not aligned). */ int esl_dst_CPairId(const char *asq1, const char *asq2, double *opt_pid, int *opt_nid, int *opt_n) { int status; int idents; /* total identical positions */ int len1, len2; /* lengths of seqs */ int i; /* position in aligned seqs */ idents = len1 = len2 = 0; for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++) { if (isalpha(asq1[i])) len1++; if (isalpha(asq2[i])) len2++; if (isalpha(asq1[i]) && isalpha(asq2[i]) && toupper(asq1[i]) == toupper(asq2[i])) idents++; } if (asq1[i] != '\0' || asq2[i] != '\0') ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_pid != NULL) *opt_pid = ( len1==0 ? 0. : (double) idents / (double) ESL_MIN(len1,len2)); if (opt_nid != NULL) *opt_nid = idents; if (opt_n != NULL) *opt_n = len1; return eslOK; ERROR: if (opt_pid != NULL) *opt_pid = 0.; if (opt_nid != NULL) *opt_nid = 0; if (opt_n != NULL) *opt_n = 0; return status; } /* Function: esl_dst_CPairMatch() * Synopsis: Pairwise matches of two aligned text strings. * Incept: ER, Wed Oct 29 09:02:35 EDT 2014 [janelia] * * Purpose: Calculates pairwise fractional matches between two * aligned character strings and . * Return this distance in ; return the * number of matches counted in ; and * return the denominator in * . * * Alphabetic symbols <[a-zA-Z]> are compared * case-insensitively for identity. Any nonalphabetic * character is assumed to be a gap symbol. * * This simple comparison rule is unaware of synonyms and * degeneracies in biological alphabets. For a more * sophisticated and biosequence-aware comparison, use * digitized sequences and the function * instead. Note that currently does * not correctly handle degeneracies, but is set up to. * * Args: asq1 - aligned character string 1 * asq2 - aligned character string 2 * opt_pmatch - optRETURN: pairwise matches, 0<=x<=1 * opt_nmatch - optRETURN: # of matches * opt_n - optRETURN: denominator alen - double_gaps * * Returns: on success. , , * contain the answers (for whichever were passed non-NULL). * * Throws: if the strings are different lengths * (not aligned). */ int esl_dst_CPairMatch(const char *asq1, const char *asq2, double *opt_pmatch, int *opt_nmatch, int *opt_n) { int status; int match; /* total matched positions */ int len; /* length of alignment (no double gaps) */ int i; /* position in aligned seqs */ match = len = 0; for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++) { if (isalpha(asq1[i]) || isalpha(asq2[i])) len++; if (isalpha(asq1[i]) && isalpha(asq2[i])) match++; } if (asq1[i] != '\0' || asq2[i] != '\0') ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_pmatch != NULL) *opt_pmatch = ( len==0 ? 0. : (double)match / (double)len); if (opt_nmatch != NULL) *opt_nmatch = match; if (opt_n != NULL) *opt_n = len; return eslOK; ERROR: if (opt_pmatch != NULL) *opt_pmatch = 0.; if (opt_nmatch != NULL) *opt_nmatch = 0; if (opt_n != NULL) *opt_n = 0; return status; } /* Function: esl_dst_CJukesCantor() * Synopsis: Jukes-Cantor distance for two aligned strings. * Incept: SRE, Tue Apr 18 14:00:37 2006 [St. Louis] * * Purpose: Calculate the generalized Jukes-Cantor distance between * two aligned character strings and , in * substitutions/site, for an alphabet of residues * ( for nucleic acid, for proteins). The * maximum likelihood estimate for the distance is * optionally returned in . The large-sample * variance for the distance estimate is * optionally returned in . * * Alphabetic symbols <[a-zA-Z]> are compared * case-insensitively to count the number of identities * () and mismatches (>). Any nonalphabetic * character is assumed to be a gap symbol, and aligned * columns containing gap symbols are ignored. The * fractional difference used to calculate the * Jukes/Cantor distance is . * * Args: K - size of the alphabet (4 or 20) * as1 - 1st aligned seq, 0..L-1, \0-terminated * as2 - 2nd aligned seq, 0..L-1, \0-terminated * opt_distance - optRETURN: ML estimate of distance d * opt_variance - optRETURN: large-sample variance of d * * Returns: on success. * * Infinite distances are possible, in which case distance * and variance are both . Caller has to deal * with this case as it sees fit, perhaps by enforcing * an arbitrary maximum distance. * * Throws: if the two strings aren't the same length (and * thus can't have been properly aligned). * if no aligned residues were counted. * On either failure, distance and variance are both returned * as . */ int esl_dst_CJukesCantor(int K, const char *as1, const char *as2, double *opt_distance, double *opt_variance) { int status; int n1, n2; /* number of observed identities, substitutions */ int i; /* position in aligned seqs */ /* 1. Count identities, mismatches. */ n1 = n2 = 0; for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++) { if (isalpha(as1[i]) && isalpha(as2[i])) { if (toupper(as1[i]) == toupper(as2[i])) n1++; else n2++; } } if (as1[i] != '\0' || as2[i] != '\0') ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); return jukescantor(n1, n2, K, opt_distance, opt_variance); /* can throw eslEDIVZERO */ ERROR: if (opt_distance != NULL) *opt_distance = HUGE_VAL; if (opt_variance != NULL) *opt_variance = HUGE_VAL; return status; } /*------- end, pairwise distances for aligned text seqs ---------*/ /***************************************************************** * 2. Pairwise distances for aligned digitized sequences. *****************************************************************/ /* Function: esl_dst_XPairId() * Synopsis: Pairwise identity of two aligned digital seqs. * Incept: SRE, Tue Apr 18 09:24:05 2006 [St. Louis] * * Purpose: Digital version of : and * are digitized aligned sequences, in alphabet * . Otherwise, same as except * that only canonical residues are counted and checked for * identity, while (which has no * alphabet) counts and checks identity of all alphanumeric * characters. * * This function does not use to handle * degeneracies but it is set up to do so. Doing that would * require that be changed to a float or double, * or its meaning be changed to be the number of canonical * identities. * * Args: abc - digital alphabet in use * ax1 - aligned digital seq 1 * ax2 - aligned digital seq 2 * opt_pid - optRETURN: pairwise identity, 0<=x<=1 * opt_nid - optRETURN: # of identities * opt_n - optRETURN: denominator MIN(len1,len2) * * Returns: on success. , , * contain the answers, for any of these that were passed * non- pointers. * * Throws: if the strings are different lengths (not aligned). */ int esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, double *opt_distance, int *opt_nid, int *opt_n) { int status; int idents; /* total identical positions */ int len1, len2; /* lengths of seqs */ int i; /* position in aligned seqs */ idents = len1 = len2 = 0; for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax1[i])) len1++; if (esl_abc_XIsCanonical(abc, ax2[i])) len2++; if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i]) && ax1[i] == ax2[i]) idents++; } if (len2 < len1) len1 = len2; if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_distance != NULL) *opt_distance = ( len1==0 ? 0. : (double) idents / (double) len1 ); if (opt_nid != NULL) *opt_nid = idents; if (opt_n != NULL) *opt_n = len1; return eslOK; ERROR: if (opt_distance != NULL) *opt_distance = 0.; if (opt_nid != NULL) *opt_nid = 0; if (opt_n != NULL) *opt_n = 0; return status; } /* Function: esl_dst_XPairMatch() * Synopsis: Pairwise matches of two aligned digital seqs. * Incept: ER, Wed Oct 29 09:09:07 EDT 2014 [janelia] * * Purpose: Digital version of : and * are digitized aligned sequences, in alphabet * . Otherwise, same as except * that only canonical residues are counted and checked for * identity, while (which has no * alphabet) counts and checks identity of all alphanumeric * characters. * * Args: abc - digital alphabet in use * ax1 - aligned digital seq 1 * ax2 - aligned digital seq 2 * opt_pmatch - optRETURN: pairwise matches, 0<=x<=1 * opt_nmatch - optRETURN: # of maches * opt_n - optRETURN: denominator alen-double_gaps * * Returns: on success. , , * contain the answers, for any of these that were passed * non- pointers. * * Throws: if the strings are different lengths (not aligned). */ int esl_dst_XPairMatch(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, double *opt_distance, int *opt_nmatch, int *opt_n) { int status; int match; /* total matched positions */ int len; /* length of alignment (no double gaps) */ int i; /* position in aligned seqs */ match = len = 0; for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax1[i]) || esl_abc_XIsCanonical(abc, ax2[i])) len ++; if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])) match++; } if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_distance != NULL) *opt_distance = ( len==0 ? 0. : (double)match / (double)len ); if (opt_nmatch != NULL) *opt_nmatch = match; if (opt_n != NULL) *opt_n = len; return eslOK; ERROR: if (opt_distance != NULL) *opt_distance = 0.; if (opt_nmatch != NULL) *opt_nmatch = 0; if (opt_n != NULL) *opt_n = 0; return status; } /* Function: esl_dst_XJukesCantor() * Synopsis: Jukes-Cantor distance for two aligned digitized seqs. * Incept: SRE, Tue Apr 18 15:26:51 2006 [St. Louis] * * Purpose: Calculate the generalized Jukes-Cantor distance between two * aligned digital strings and , in substitutions/site, * using alphabet to evaluate identities and differences. * The maximum likelihood estimate for the distance is optionally returned in * . The large-sample variance for the distance * estimate is optionally returned in . * * Identical to , except that it takes * digital sequences instead of character strings. * * Args: abc - bioalphabet to use for comparisons * ax - 1st digital aligned seq * ay - 2nd digital aligned seq * opt_distance - optRETURN: ML estimate of distance d * opt_variance - optRETURN: large-sample variance of d * * Returns: on success. As in , the * distance and variance may be infinite, in which case they * are returned as . * * Throws: if the two strings aren't the same length (and * thus can't have been properly aligned). * if no aligned residues were counted. * On either failure, the distance and variance are set * to . */ int esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax, const ESL_DSQ *ay, double *opt_distance, double *opt_variance) { int status; int n1, n2; /* number of observed identities, substitutions */ int i; /* position in aligned seqs */ n1 = n2 = 0; for (i = 1; ax[i] != eslDSQ_SENTINEL && ay[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax[i]) && esl_abc_XIsCanonical(abc, ay[i])) { if (ax[i] == ay[i]) n1++; else n2++; } } if (ax[i] != eslDSQ_SENTINEL || ay[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); return jukescantor(n1, n2, abc->K, opt_distance, opt_variance); ERROR: if (opt_distance != NULL) *opt_distance = HUGE_VAL; if (opt_variance != NULL) *opt_variance = HUGE_VAL; return status; } /*---------- end pairwise distances, digital seqs --------------*/ /***************************************************************** * 3. Distance matrices for aligned text sequences. *****************************************************************/ /* Function: esl_dst_CPairIdMx() * Synopsis: NxN identity matrix for N aligned text sequences. * Incept: SRE, Thu Apr 27 08:46:08 2006 [New York] * * Purpose: Given a multiple sequence alignment , consisting * of aligned character strings; calculate * a symmetric fractional pairwise identity matrix by $N(N-1)/2$ * calls to , and return it in * . * * Args: as - aligned seqs (all same length), [0..N-1] * N - # of aligned sequences * ret_S - RETURN: symmetric fractional identity matrix * * Returns: on success, and contains the fractional * identity matrix. Caller free's with * . * * Throws: if a seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_CPairIdMx(char **as, int N, ESL_DMATRIX **ret_S) { ESL_DMATRIX *S = NULL; int status; int i,j; if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < N; i++) { S->mx[i][i] = 1.; for (j = i+1; j < N; j++) { status = esl_dst_CPairId(as[i], as[j], &(S->mx[i][j]), NULL, NULL); if (status != eslOK) ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j); S->mx[j][i] = S->mx[i][j]; } } if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S); return eslOK; ERROR: if (S != NULL) esl_dmatrix_Destroy(S); if (ret_S != NULL) *ret_S = NULL; return status; } /* Function: esl_dst_CDiffMx() * Synopsis: NxN difference matrix for N aligned text sequences. * Incept: SRE, Fri Apr 28 06:27:20 2006 [New York] * * Purpose: Same as , but calculates * the fractional difference instead of the * fractional identity for each pair. * * Args: as - aligned seqs (all same length), [0..N-1] * N - # of aligned sequences * ret_D - RETURN: symmetric fractional difference matrix * * Returns: on success, and contains the * fractional difference matrix. Caller free's with * . * * Throws: if any seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. */ int esl_dst_CDiffMx(char **as, int N, ESL_DMATRIX **ret_D) { ESL_DMATRIX *D = NULL; int status; int i,j; status = esl_dst_CPairIdMx(as, N, &D); if (status != eslOK) goto ERROR; for (i = 0; i < N; i++) { D->mx[i][i] = 0.; for (j = i+1; j < N; j++) { D->mx[i][j] = 1. - D->mx[i][j]; D->mx[j][i] = D->mx[i][j]; } } if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D); return eslOK; ERROR: if (D != NULL) esl_dmatrix_Destroy(D); if (ret_D != NULL) *ret_D = NULL; return status; } /* Function: esl_dst_CJukesCantorMx() * Synopsis: NxN Jukes/Cantor distance matrix for N aligned text seqs. * Incept: SRE, Tue Apr 18 16:00:16 2006 [St. Louis] * * Purpose: Given a multiple sequence alignment , consisting of * aligned character sequences in an alphabet of * letters (usually 4 for DNA, 20 for protein); * calculate a symmetric Jukes/Cantor pairwise distance * matrix for all sequence pairs, and optionally return the distance * matrix in , and optionally return a symmetric matrix of the * large-sample variances for those ML distance estimates * in . * * Infinite distances (and variances) are possible; they * are represented as in and . Caller must * be prepared to deal with them as appropriate. * * Args: K - size of the alphabet (usually 4 or 20) * aseq - aligned sequences [0.nseq-1][0..L-1] * nseq - number of aseqs * opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx * opt_V - optRETURN: matrix of variances. * * Returns: on success. and contain the * distance matrix (and variances); caller frees these with * . * * Throws: if any pair of sequences have differing lengths * (and thus cannot have been properly aligned). * if some pair of sequences had no aligned * residues. On failure, and are both returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_CJukesCantorMx(int K, char **aseq, int nseq, ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V) { int status; ESL_DMATRIX *D = NULL; ESL_DMATRIX *V = NULL; int i,j; if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; } if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < nseq; i++) { D->mx[i][i] = 0.; V->mx[i][i] = 0.; for (j = i+1; j < nseq; j++) { status = esl_dst_CJukesCantor(K, aseq[i], aseq[j], &(D->mx[i][j]), &(V->mx[i][j])); if (status != eslOK) ESL_XEXCEPTION(status, "J/C calculation failed at seqs %d,%d", i,j); D->mx[j][i] = D->mx[i][j]; V->mx[j][i] = V->mx[i][j]; } } if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D); if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V); return eslOK; ERROR: if (D != NULL) esl_dmatrix_Destroy(D); if (V != NULL) esl_dmatrix_Destroy(V); if (opt_D != NULL) *opt_D = NULL; if (opt_V != NULL) *opt_V = NULL; return status; } /*----------- end, distance matrices for aligned text seqs ---------*/ /***************************************************************** * 4. Distance matrices for aligned digital sequences. *****************************************************************/ /* Function: esl_dst_XPairIdMx() * Synopsis: NxN identity matrix for N aligned digital seqs. * Incept: SRE, Thu Apr 27 09:08:11 2006 [New York] * * Purpose: Given a digitized multiple sequence alignment , consisting * of aligned digital sequences in alphabet ; calculate * a symmetric pairwise fractional identity matrix by $N(N-1)/2$ * calls to , and return it in . * * Args: abc - digital alphabet in use * ax - aligned dsq's, [0..N-1][1..alen] * N - number of aligned sequences * ret_S - RETURN: NxN matrix of fractional identities * * Returns: on success, and contains the distance * matrix. Caller is obligated to free with * . * * Throws: if a seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_XPairIdMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S) { int status; ESL_DMATRIX *S = NULL; int i,j; if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < N; i++) { S->mx[i][i] = 1.; for (j = i+1; j < N; j++) { status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL); if (status != eslOK) ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j); S->mx[j][i] = S->mx[i][j]; } } if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S); return eslOK; ERROR: if (S != NULL) esl_dmatrix_Destroy(S); if (ret_S != NULL) *ret_S = NULL; return status; } /* Function: esl_dst_XDiffMx() * Synopsis: NxN difference matrix for N aligned digital seqs. * Incept: SRE, Fri Apr 28 06:37:29 2006 [New York] * * Purpose: Same as , but calculates fractional * difference <1-s> instead of fractional identity for * each pair. * * Args: abc - digital alphabet in use * ax - aligned dsq's, [0..N-1][1..alen] * N - number of aligned sequences * ret_D - RETURN: NxN matrix of fractional differences * * Returns: on success, and contains the difference * matrix; caller is obligated to free with * . * * Throws: if a seq has a different * length than others. On failure, is returned * and state of inputs is unchanged. */ int esl_dst_XDiffMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_D) { int status; ESL_DMATRIX *D = NULL; int i,j; status = esl_dst_XPairIdMx(abc, ax, N, &D); if (status != eslOK) goto ERROR; for (i = 0; i < N; i++) { D->mx[i][i] = 0.; for (j = i+1; j < N; j++) { D->mx[i][j] = 1. - D->mx[i][j]; D->mx[j][i] = D->mx[i][j]; } } if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D); return eslOK; ERROR: if (D != NULL) esl_dmatrix_Destroy(D); if (ret_D != NULL) *ret_D = NULL; return status; } /* Function: esl_dst_XJukesCantorMx() * Synopsis: NxN Jukes/Cantor distance matrix for N aligned digital seqs. * Incept: SRE, Thu Apr 27 08:38:08 2006 [New York City] * * Purpose: Given a digitized multiple sequence alignment , * consisting of aligned digital sequences in * bioalphabet , calculate a symmetric Jukes/Cantor * pairwise distance matrix for all sequence pairs; * optionally return the distance matrix in and * a matrix of the large-sample variances for those ML distance * estimates in . * * Infinite distances (and variances) are possible. They * are represented as in and . Caller must * be prepared to deal with them as appropriate. * * Args: abc - bioalphabet for * ax - aligned digital sequences [0.nseq-1][1..L] * nseq - number of aseqs * opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx * opt_V - optRETURN: matrix of variances. * * Returns: on success. (and optionally ) contain the * distance matrix (and variances). Caller frees these with * . * * Throws: if any pair of sequences have differing lengths * (and thus cannot have been properly aligned). * if some pair of sequences had no aligned * residues. On failure, and are both returned * and state of inputs is unchanged. * * on allocation failure. */ int esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq, ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V) { ESL_DMATRIX *D = NULL; ESL_DMATRIX *V = NULL; int status; int i,j; if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; } if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; } for (i = 0; i < nseq; i++) { D->mx[i][i] = 0.; V->mx[i][i] = 0.; for (j = i+1; j < nseq; j++) { status = esl_dst_XJukesCantor(abc, ax[i], ax[j], &(D->mx[i][j]), &(V->mx[i][j])); if (status != eslOK) ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j); D->mx[j][i] = D->mx[i][j]; V->mx[j][i] = V->mx[i][j]; } } if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D); if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V); return eslOK; ERROR: if (D != NULL) esl_dmatrix_Destroy(D); if (V != NULL) esl_dmatrix_Destroy(V); if (opt_D != NULL) *opt_D = NULL; if (opt_V != NULL) *opt_V = NULL; return status; } /*------- end, distance matrices for digital alignments ---------*/ /***************************************************************** * 5. Average pairwise identity for multiple alignments *****************************************************************/ /* Function: esl_dst_CAverageId() * Synopsis: Calculate avg identity for multiple alignment * Incept: SRE, Fri May 18 15:02:38 2007 [Janelia] * * Purpose: Calculates the average pairwise fractional identity in * a multiple sequence alignment , consisting of * aligned character sequences of identical length. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise identity (range $[0..$ pid $..1]$ * is calculated using . * * Returns: on success, and <*ret_id> contains the average * fractional identity. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_id> is set to 0. */ int esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id) { int status; double id; double sum = 0.; int i,j,n; if (N <= 1) { *ret_id = 1.; return eslOK; } *ret_id = 0.; /* Is nseq small enough that we can average over all pairwise comparisons? */ if ((N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } sum /= (double) (N * (N-1) / 2); } /* If nseq is large, calculate average over a stochastic sample. */ else { ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */ for (n = 0; n < max_comparisons; n++) { do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */ if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } sum /= (double) max_comparisons; esl_randomness_Destroy(r); } *ret_id = sum; return eslOK; } /* Function: esl_dst_CAverageMatch() * Synopsis: Calculate avg matches for multiple alignment * Incept: ER, Wed Oct 29 09:25:09 EDT 2014 [Janelia] * * Purpose: Calculates the average pairwise fractional matches in * a multiple sequence alignment , consisting of * aligned character sequences of identical length. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise matches (range $[0..$ pid $..1]$ * is calculated using . * * Returns: on success, and <*ret_match> contains the average * fractional matches. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_match> is set to 0. */ int esl_dst_CAverageMatch(char **as, int N, int max_comparisons, double *ret_match) { int status; double match; double sum = 0.; int i,j,n; if (N <= 1) { *ret_match = 1.; return eslOK; } *ret_match = 0.; /* Is nseq small enough that we can average over all pairwise comparisons? */ if ((N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status; sum += match; } sum /= (double) (N * (N-1) / 2); } /* If nseq is large, calculate average over a stochastic sample. */ else { ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */ for (n = 0; n < max_comparisons; n++) { do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */ if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status; sum += match; } sum /= (double) max_comparisons; esl_randomness_Destroy(r); } *ret_match = sum; return eslOK; } /* Function: esl_dst_XAverageId() * Synopsis: Calculate avg identity for digital MSA * Incept: SRE, Fri May 18 15:19:14 2007 [Janelia] * * Purpose: Calculates the average pairwise fractional identity in * a digital multiple sequence alignment , consisting of * aligned digital sequences of identical length. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise identity (range $[0..$ pid $..1]$ * is calculated using . * * Returns: on success, and <*ret_id> contains the average * fractional identity. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_id> is set to 0. */ int esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id) { int status; double id; double sum = 0.; int i,j,n; if (N <= 1) { *ret_id = 1.; return eslOK; } *ret_id = 0.; /* Is N small enough that we can average over all pairwise comparisons? watch out for numerical overflow in this: Pfam N's easily overflow when squared */ if (N <= max_comparisons && N <= sqrt(2. * max_comparisons) && (N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } sum /= (double) (N * (N-1) / 2); } /* If nseq is large, calculate average over a stochastic sample. */ else { ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */ for (n = 0; n < max_comparisons; n++) { do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */ if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } sum /= (double) max_comparisons; esl_randomness_Destroy(r); } *ret_id = sum; return eslOK; } /* Function: esl_dst_XAverageMatch() * Synopsis: Calculate avg matches for digital MSA * Incept: ER, ed Oct 29 09:29:05 EDT 2014 [Janelia] * * Purpose: Calculates the average pairwise fractional matches in * a digital multiple sequence alignment , consisting of * aligned digital sequences of identical length. * * If an exhaustive calculation would require more than * pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise matches (range $[0..$ pid $..1]$ * is calculated using . * * Returns: on success, and <*ret_match> contains the average * fractional identity. * * Throws: on allocation failure. * if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_match> is set to 0. */ int esl_dst_XAverageMatch(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_match) { int status; double match; double sum = 0.; int i,j,n; if (N <= 1) { *ret_match = 1.; return eslOK; } *ret_match = 0.; /* Is N small enough that we can average over all pairwise comparisons? watch out for numerical overflow in this: Pfam N's easily overflow when squared */ if (N <= max_comparisons && N <= sqrt(2. * max_comparisons) && (N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status; sum += match; } sum /= (double) (N * (N-1) / 2); } /* If nseq is large, calculate average over a stochastic sample. */ else { ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */ for (n = 0; n < max_comparisons; n++) { do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */ if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status; sum += match; } sum /= (double) max_comparisons; esl_randomness_Destroy(r); } *ret_match = sum; return eslOK; } /***************************************************************** * 6. Private (static) functions *****************************************************************/ /* jukescantor() * * The generalized Jukes/Cantor distance calculation. * Given identities and differences, for a * base alphabet size of (4 or 20); * calculate J/C distance in substitutions/site and * return it in ; calculate large-sample * variance and return it in . * * Returns if there are no data (). */ static int jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance) { int status; double D, K, N; double x; double distance, variance; ESL_DASSERT1( (n1 >= 0) ); ESL_DASSERT1( (n2 >= 0) ); ESL_DASSERT1( (alphabet_size >= 0) ); if (n1+n2 == 0) { status = eslEDIVZERO; goto ERROR; } K = (double) alphabet_size; D = (double) n2 / (double) (n1+n2); N = (double) (n1+n2); x = 1. - D * K/(K-1.); if (x <= 0.) { distance = HUGE_VAL; variance = HUGE_VAL; } else { distance = -log(x) * K/(K-1); variance = exp( 2.*K*distance/(K-1) ) * D * (1.-D) / N; } if (opt_distance != NULL) *opt_distance = distance; if (opt_variance != NULL) *opt_variance = variance; return eslOK; ERROR: if (opt_distance != NULL) *opt_distance = HUGE_VAL; if (opt_variance != NULL) *opt_variance = HUGE_VAL; return status; } /*--------------- end of private functions ----------------------*/ /***************************************************************** * 7. Unit tests. *****************************************************************/ #ifdef eslDISTANCE_TESTDRIVE /* Each unit test is given an alignment with certain known * properties: * seqs 0,1 are identical * seqs 0,2 are completely different * seqs 3..N are random * The alignment may contain gaps, so don't assume that the * # of compared residues == alignment length. The alignment * contains only canonical residues, because one of our tests * is that C and X functions give the same results. */ static int utest_CPairId(char **as, int N) { double pid; int nid; int nres; int L; int i,j; /* Self comparison gives identity = 1. */ L = strlen(as[0]); if (esl_dst_CPairId(as[0], as[0], &pid, &nid, &nres) != eslOK) abort(); if (pid != 1.0 || nid != L || nres > L) abort(); /* So does 0,1 comparison */ if (esl_dst_CPairId(as[0], as[1], &pid, &nid, &nres) != eslOK) abort(); if (pid != 1.0 || nid != L || nres > L) abort(); /* 0,2 comparison gives 0.0, 0 */ if (esl_dst_CPairId(as[0], as[2], &pid, &nid, &nres) != eslOK) abort(); if (pid != 0.0 || nid != 0 || nres > L) abort(); /* remaining comparisons shouldn't fail */ for (i = 3; i < N; i++) for (j = i; j < N; j++) { if (esl_dst_CPairId(as[i], as[j], &pid, &nid, &nres) != eslOK) abort(); if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L) abort(); } /* API should accept NULL for return values */ if (esl_dst_CPairId(as[0], as[0], NULL, NULL, NULL) != eslOK) abort(); return eslOK; } static int utest_CJukesCantor(int K, char **as, int N) { double d, V; int i,j; /* Self comparison gives distance = 0. */ if (esl_dst_CJukesCantor(K, as[0], as[0], &d, &V) != eslOK) abort(); if (d != 0.0) abort(); /* So does 0,1 comparison */ if (esl_dst_CJukesCantor(K, as[0], as[1], &d, &V) != eslOK) abort(); if (d != 0.0) abort(); /* 0,2 comparison gives infinite distance (HUGE_VAL) */ if (esl_dst_CJukesCantor(K, as[0], as[2], &d, &V) != eslOK) abort(); if (d != HUGE_VAL) abort(); /* remaining comparisons shouldn't fail */ for (i = 3; i < N; i++) for (j = i; j < N; j++) if (esl_dst_CJukesCantor(K, as[i], as[j], &d, &V) != eslOK) abort(); /* API should accept NULL for return values */ if (esl_dst_CJukesCantor(K, as[0], as[0], NULL, NULL) != eslOK) abort(); return eslOK; } static int utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N) { double pid, pid2; int nid, nid2; int nres, nres2; int dL, L; int i,j; /* Self comparison gives identity = 1. */ dL = esl_abc_dsqlen(ax[0]); L = strlen(as[0]); if (dL != L) abort(); if (esl_dst_XPairId(abc, ax[0], ax[0], &pid, &nid, &nres) != eslOK) abort(); if (pid != 1.0 || nid != L || nres > dL) abort(); /* So does 0,1 comparison */ if (esl_dst_XPairId(abc, ax[0], ax[1], &pid, &nid, &nres) != eslOK) abort(); if (pid != 1.0 || nid != L || nres > L) abort(); /* 0,2 comparison gives 0.0, 0 */ if (esl_dst_XPairId(abc, ax[0], ax[2], &pid, &nid, &nres) != eslOK) abort(); if (pid != 0.0 || nid != 0 || nres > L) abort(); /* remaining comparisons shouldn't fail, and should be identical to text mode */ for (i = 3; i < N; i++) for (j = i; j < N; j++) { if (esl_dst_XPairId(abc, ax[i], ax[j], &pid, &nid, &nres) != eslOK) abort(); if (esl_dst_CPairId(as[i], as[j], &pid2, &nid2, &nres2) != eslOK) abort(); if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L) abort(); if (pid != pid2 || nid != nid2 || nres != nres2) abort(); } /* API should accept NULL for return values */ if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort(); return eslOK; } static int utest_XJukesCantor(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N) { double d, V; int i,j; /* Self comparison gives distance = 0. */ if (esl_dst_XJukesCantor(abc, ax[0], ax[0], &d, &V) != eslOK) abort(); if (d != 0.0) abort(); /* So does 0,1 comparison */ if (esl_dst_XJukesCantor(abc, ax[0], ax[1], &d, &V) != eslOK) abort(); if (d != 0.0) abort(); /* 0,2 comparison gives infinite distance (HUGE_VAL) */ if (esl_dst_XJukesCantor(abc, ax[0], ax[2], &d, &V) != eslOK) abort(); if (d != HUGE_VAL) abort(); /* remaining comparisons shouldn't fail */ for (i = 3; i < N; i++) for (j = i; j < N; j++) if (esl_dst_XJukesCantor(abc, ax[i], ax[j], &d, &V) != eslOK) abort(); /* API should accept NULL for return values */ if (esl_dst_XJukesCantor(abc, ax[0], ax[0], NULL, NULL) != eslOK) abort(); return eslOK; } static int utest_CPairIdMx(char **as, int N) { ESL_DMATRIX *S; int i,j; double pid; if (esl_dst_CPairIdMx(as, N, &S) != eslOK) abort(); for (i = 0; i < N; i++) if (S->mx[i][i] != 1.0) abort(); pid = 0.; for (i = 3; i < N; i++) for (j = i+1; j < N; j++) pid += S->mx[i][j]; pid /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */ if (pid < 0.15 || pid > 0.35) abort(); /* should be 0.25 */ esl_dmatrix_Destroy(S); return eslOK; } static int utest_CDiffMx(char **as, int N) { ESL_DMATRIX *D; int i,j; double diff; if (esl_dst_CDiffMx(as, N, &D) != eslOK) abort(); for (i = 0; i < N; i++) if (D->mx[i][i] != 0.0) abort(); diff = 0.; for (i = 3; i < N; i++) for (j = i+1; j < N; j++) diff += D->mx[i][j]; diff /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */ if (diff < 0.65 || diff > 0.85) abort(); /* should be 0.75 */ esl_dmatrix_Destroy(D); return eslOK; } static int utest_CJukesCantorMx(int K, char **as, int N) { ESL_DMATRIX *D, *V; /* just a crash test */ if (esl_dst_CJukesCantorMx(K, as, N, &D, &V) != eslOK) abort(); esl_dmatrix_Destroy(D); esl_dmatrix_Destroy(V); return eslOK; } static int utest_XPairIdMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N) { ESL_DMATRIX *S, *S2; int i, j; if (esl_dst_XPairIdMx(abc, ax, N, &S) != eslOK) abort(); if (esl_dst_CPairIdMx(as, N, &S2) != eslOK) abort(); for (i = 0; i < N; i++) for (j = i; j < N; j++) if (fabs(S->mx[i][j] - S2->mx[j][i]) > 0.01) abort(); esl_dmatrix_Destroy(S); esl_dmatrix_Destroy(S2); return eslOK; } static int utest_XDiffMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N) { ESL_DMATRIX *D, *D2; int i, j; if (esl_dst_XDiffMx(abc, ax, N, &D) != eslOK) abort(); if (esl_dst_CDiffMx(as, N, &D2) != eslOK) abort(); for (i = 0; i < N; i++) for (j = i; j < N; j++) if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort(); esl_dmatrix_Destroy(D); esl_dmatrix_Destroy(D2); return eslOK; } static int utest_XJukesCantorMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N) { ESL_DMATRIX *D, *D2, *V, *V2; int i, j; if (esl_dst_XJukesCantorMx(abc, ax, N, &D, &V) != eslOK) abort(); if (esl_dst_CJukesCantorMx(abc->K, as, N, &D2, &V2) != eslOK) abort(); for (i = 0; i < N; i++) for (j = i; j < N; j++) { if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort(); if (fabs(V->mx[i][j] - V2->mx[j][i]) > 0.01) abort(); } esl_dmatrix_Destroy(D); esl_dmatrix_Destroy(D2); esl_dmatrix_Destroy(V); esl_dmatrix_Destroy(V2); return eslOK; } #endif /* eslDISTANCE_TESTDRIVE */ /*------------------ end of unit tests --------------------------*/ /***************************************************************** * 8. Test driver. *****************************************************************/ #ifdef eslDISTANCE_TESTDRIVE #include "easel.h" #include "esl_alphabet.h" #include "esl_arr2.h" #include "esl_dmatrix.h" #include "esl_getopts.h" #include "esl_random.h" #include "esl_randomseq.h" #include "esl_distance.h" static ESL_OPTIONS options[] = { /* name type def env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0}, { "-N", eslARG_INT, "10", NULL,"n>3", NULL, NULL, NULL, "number of iid seqs in alignment",0}, { "-L", eslARG_INT, "50", NULL,"n>0", NULL, NULL, NULL, "length of seqs in alignment", 0}, { "--seed", eslARG_INT, "42", NULL,"n>=0",NULL, NULL, NULL, "random # seed", 0}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "Usage: ./testdrive-distance [-options]"; int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; char **as = NULL; /* aligned character seqs (random, iid) */ int N,L; /* # of seqs, and their aligned lengths */ int seed; int i,j; int status; double p[4]; /// ACGT probabilities ESL_DSQ **ax = NULL; // digitized alignment ESL_ALPHABET *abc = NULL; /* Process command line */ go = esl_getopts_Create(options); esl_opt_ProcessCmdline(go, argc, argv); esl_opt_VerifyConfig(go); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } L = esl_opt_GetInteger(go, "-L"); N = esl_opt_GetInteger(go, "-N"); seed = esl_opt_GetInteger(go, "--seed"); if (esl_opt_ArgNumber(go) != 0) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } esl_getopts_Destroy(go); /* Create a random DNA alignment; * force it to obey the conventions of the unit tests: * 0,1 are identical * 0,2 are completely dissimilar */ r = esl_randomness_Create(seed); for (i = 0; i < 4; i++) p[i] = 0.25; ESL_ALLOC(as, sizeof(char *) * N); for (i = 0; i < N; i++) ESL_ALLOC(as[i], sizeof(char) * (L+1)); esl_rsq_IID(r, "ACGT", p, 4, L, as[0]); strcpy(as[1], as[0]); esl_rsq_IID(r, "ACGT", p, 4, L, as[2]); for (j = 0; j < L; j++) while (as[2][j] == as[0][j]) as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)]; for (i = 3; i < N; i++) esl_rsq_IID(r, "ACGT", p, 4, L, as[i]); abc = esl_alphabet_Create(eslDNA); ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N); for (i = 0; i < N; i++) esl_abc_CreateDsq(abc, as[i], &(ax[i])); /* Unit tests */ if (utest_CPairId(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantor(4, as, N) != eslOK) return eslFAIL; if (utest_XPairId(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_CPairIdMx(as, N) != eslOK) return eslFAIL; if (utest_CDiffMx(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantorMx(4, as, N) != eslOK) return eslFAIL; if (utest_XPairIdMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XDiffMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantorMx(abc, as, ax, N) != eslOK) return eslFAIL; esl_randomness_Destroy(r); esl_alphabet_Destroy(abc); esl_arr2_Destroy((void **) as, N); esl_arr2_Destroy((void **) ax, N); return eslOK; ERROR: return eslFAIL; } #endif /*eslDISTANCE_TESTDRIVE*/ /***************************************************************** * 9. Example. *****************************************************************/ #ifdef eslDISTANCE_EXAMPLE /*::cexcerpt::distance_example::begin::*/ /* gcc -g -Wall -o example -I. -DeslDISTANCE_EXAMPLE esl_distance.c\ esl_dmatrix.c esl_msa.c easel.c -lm ./example */ #include "easel.h" #include "esl_distance.h" #include "esl_dmatrix.h" #include "esl_msa.h" int main(int argc, char **argv) { ESL_MSAFILE *afp; ESL_MSA *msa; ESL_DMATRIX *P; int i,j; double min, avg, max; int status; if ((status = esl_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status); if ((status = esl_msafile_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status); esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P); min = 1.0; max = 0.0; avg = 0.0; for (i = 0; i < msa->nseq; i++) for (j = i+1; j < msa->nseq; j++) { avg += P->mx[i][j]; if (P->mx[i][j] < min) min = P->mx[i][j]; if (P->mx[i][j] > max) max = P->mx[i][j]; } avg /= (double) (msa->nseq * (msa->nseq-1) / 2); printf("Average pairwise %% id: %.1f%%\n", avg * 100.); printf("Minimum pairwise %% id: %.1f%%\n", min * 100.); printf("Maximum pairwise %% id: %.1f%%\n", max * 100.); esl_dmatrix_Destroy(P); esl_msa_Destroy(msa); esl_msafile_Close(afp); return 0; } /*::cexcerpt::distance_example::end::*/ #endif /*eslDISTANCE_EXAMPLE*/