/* Phylogenetic trees. * * Contents: * 1. The ESL_TREE object. * 2. Newick format i/o * 3. Tree comparison algorithms. * 4. Clustering algorithms for distance-based tree construction. * 5. Generating simulated trees. * 6. Unit tests. * 7. Test driver. * 8. Examples. */ #include "esl_config.h" #include #include #include #include #include "easel.h" #include "esl_arr2.h" #include "esl_dmatrix.h" #include "esl_random.h" #include "esl_stack.h" #include "esl_vectorops.h" #include "esl_tree.h" /***************************************************************** *# 1. The ESL_TREE object. *****************************************************************/ /* Function: esl_tree_Create() * * Purpose: Allocate an empty tree structure for taxa * and return a pointer to it. must be $\geq 2$. * * Args: - number of taxa * * Returns: pointer to the new object; caller frees * this with . * * Throws: if allocation fails. */ ESL_TREE * esl_tree_Create(int ntaxa) { ESL_TREE *T = NULL; int i; int status; /* Contract verification */ ESL_DASSERT1((ntaxa >= 2)); /* 1st allocation round */ ESL_ALLOC(T, sizeof(ESL_TREE)); T->parent = NULL; T->left = NULL; T->right = NULL; T->ld = NULL; T->rd = NULL; /* Optional info starts NULL */ T->taxaparent = NULL; T->cladesize = NULL; T->taxonlabel = NULL; T->nodelabel = NULL; /* Additive trees are assumed by default, as opposed to linkage trees */ T->is_linkage_tree = FALSE; /* Tree output options default to PHYLIP style */ T->show_unrooted = FALSE; T->show_node_labels = TRUE; T->show_root_branchlength = FALSE; T->show_branchlengths = TRUE; T->show_quoted_labels = FALSE; T->show_numeric_taxonlabels = TRUE; /* 2nd allocation round */ T->N = ntaxa; ESL_ALLOC(T->parent, sizeof(int) * (ntaxa-1)); ESL_ALLOC(T->left, sizeof(int) * (ntaxa-1)); ESL_ALLOC(T->right, sizeof(int) * (ntaxa-1)); ESL_ALLOC(T->ld, sizeof(double) * (ntaxa-1)); ESL_ALLOC(T->rd, sizeof(double) * (ntaxa-1)); for (i = 0; i < ntaxa-1; i++) { T->parent[i] = 0; T->left[i ] = 0; T->right[i] = 0; T->ld[i] = 0.; T->rd[i] = 0.; } T->nalloc = ntaxa; return T; ERROR: esl_tree_Destroy(T); return NULL; } /* Function: esl_tree_CreateGrowable() * * Purpose: Allocate a growable tree structure for an initial * allocation of taxa, and return a pointer to it. * must be $\geq 2$. * * Args: - initial allocation size for number of taxa * * Returns: pointer to a new growable object; caller frees * this with . * * Throws: if allocation fails. */ ESL_TREE * esl_tree_CreateGrowable(int nalloc) { ESL_TREE *T = esl_tree_Create(nalloc); if (T == NULL) return NULL; T->N = 0; return T; } /* Function: esl_tree_CreateFromString() * * Purpose: A convenience for making small test cases in the test * suites: given the contents of a Newick file as a * single string , convert it to an . * * Returns: a pointer to the new on success. * * Throws: if it fails to obtain, open, or read the * temporary file that it puts the string in. */ ESL_TREE * esl_tree_CreateFromString(char *s) { char tmpfile[16] = "esltmpXXXXXX"; FILE *fp = NULL; ESL_TREE *T = NULL; if (esl_tmpfile(tmpfile, &fp) != eslOK) goto ERROR; fputs(s, fp); rewind(fp); if (esl_tree_ReadNewick(fp, NULL, &T) != eslOK) goto ERROR; fclose(fp); return T; ERROR: if (fp != NULL) fclose(fp); if (T != NULL) esl_tree_Destroy(T); return NULL; } /* Function: esl_tree_Grow() * * Purpose: Given a tree , make sure it can hold one more taxon; * reallocate internally if necessary by doubling the * number of taxa it is currently allocated to hold. * * Returns: on success. * * Throws: on allocation failure. In this case, * the data in the tree are unaffected. */ int esl_tree_Grow(ESL_TREE *T) { void *tmp; int nnew; int status; int i; if (T->N < T->nalloc) return eslOK; /* do we have room for next taxon? */ nnew = T->nalloc * 2; /* There are N-1 interior nodes, so arrays of info for * interior nodes are allocated for (nnew-1), whereas * arrays of info for the N taxa are allocated (nnew). */ ESL_RALLOC(T->parent, tmp, sizeof(int) * (nnew-1)); ESL_RALLOC(T->left, tmp, sizeof(int) * (nnew-1)); ESL_RALLOC(T->right, tmp, sizeof(int) * (nnew-1)); ESL_RALLOC(T->ld, tmp, sizeof(double) * (nnew-1)); ESL_RALLOC(T->rd, tmp, sizeof(double) * (nnew-1)); /* 0..N-2 were already initialized or used. * Initialize newly alloced space N-1..nnew-2. */ for (i = T->nalloc-1; i < nnew-1; i++) { T->parent[i] = 0; T->left[i ] = 0; T->right[i] = 0; T->ld[i] = 0.; T->rd[i] = 0.; } if (T->taxaparent != NULL) { ESL_RALLOC(T->taxaparent, tmp, sizeof(int) * nnew); for (i = T->nalloc; i < nnew; i++) T->taxaparent[i] = 0; } if (T->cladesize != NULL) { ESL_RALLOC(T->cladesize, tmp, sizeof(int) * nnew); for (i = T->nalloc; i < nnew; i++) T->cladesize[i] = 0; } if (T->taxonlabel != NULL) { ESL_RALLOC(T->taxonlabel, tmp, sizeof(char *) * nnew); for (i = T->nalloc; i < nnew; i++) T->taxonlabel[i] = NULL; } if (T->nodelabel != NULL) { ESL_RALLOC(T->nodelabel, tmp, sizeof(char *) * (nnew-1)); for (i = T->nalloc-1; i < nnew-1; i++) T->nodelabel[i] = NULL; } T->nalloc = nnew; return eslOK; ERROR: return eslEMEM; } /* Function: esl_tree_SetTaxaParents() * * Purpose: Constructs the taxaparent[]> array in the tree * structure , by an O(N) traversal of the tree. * Upon return, taxaparent[i]> is the index * of the internal node that taxon is a child of. * * Args: T - the tree structure to map * * Returns: on success. * * Throws: on internal allocation error. In this case, the tree is * returned unchanged. * * Xref: STL11/63 */ int esl_tree_SetTaxaParents(ESL_TREE *T) { int i; int status; if (T->taxaparent != NULL) return eslOK; // map already exists. ESL_ALLOC(T->taxaparent, sizeof(int) * T->N); for (i = 0; i < T->N; i++) T->taxaparent[i] = 0; // solely to calm static analyzers. for (i = 0; i < T->N-1; i++) /* traversal order doesn't matter */ { if (T->left[i] <= 0) T->taxaparent[-(T->left[i])] = i; if (T->right[i] <= 0) T->taxaparent[-(T->right[i])] = i; } return eslOK; ERROR: if (T->taxaparent != NULL) { free(T->taxaparent); T->taxaparent = NULL; } return status; } /* Function: esl_tree_SetCladesizes() * * Purpose: Constructs the cladesize[]> array in tree structure * . Upon successful return, cladesize[i]> is the * number of taxa contained by the clade rooted at node * in the tree. For example, cladesize[0]> is $N$ by * definition, because 0 is the root of the tree. * * Returns: on success. * * Throws: on allocation error; in this case, the * original is unmodified. */ int esl_tree_SetCladesizes(ESL_TREE *T) { int i; int status; if (T->cladesize != NULL) return eslOK; /* already set. */ ESL_ALLOC(T->cladesize, sizeof(int) * (T->N-1)); esl_vec_ISet(T->cladesize, T->N-1, 0); for (i = T->N-2; i >= 0; i--) { /* taxon: ...else... internal node: */ if (T->left[i] <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->left[i]]; if (T->right[i] <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->right[i]]; } return eslOK; ERROR: if (T->cladesize != NULL) { free(T->cladesize); T->cladesize = NULL; } return status; } /* Function: esl_tree_SetTaxonlabels() * * Purpose: Given an array of taxon names with the * same order and number as the taxa in tree , make a * copy of those names in . For example, might * be the sequence names in a multiple alignment, * sqname>. * * If the tree already had taxon names assigned to it, they * are replaced. * * As a special case, if the argument is passed as * , then the taxon labels are set to a string * corresponding to their internal index; that is, taxon 0 * is labeled "0". * * Returns: on success, and internal state of * (specifically, taxonlabel[]>) now contains a copy * of the taxa names. * * Throws: on allocation failure. taxonlabel[]> will be * (even if it was already set). */ int esl_tree_SetTaxonlabels(ESL_TREE *T, char **names) { int i; int status; if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->N); ESL_ALLOC(T->taxonlabel, sizeof(char *) * T->nalloc); for (i = 0; i < T->nalloc; i++) T->taxonlabel[i] = NULL; if (names != NULL) { for (i = 0; i < T->N; i++) if ((status = esl_strdup(names[i], -1, &(T->taxonlabel[i]))) != eslOK) goto ERROR; } else { for (i = 0; i < T->N; i++) { ESL_ALLOC(T->taxonlabel[i], sizeof(char) * 32); /* enough width for any conceivable int */ snprintf(T->taxonlabel[i], 32, "%d", i); } } return eslOK; ERROR: if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc); return status; } /* Function: esl_tree_RenumberNodes() * Synopsis: Assure nodes are numbered in preorder. * * Purpose: Given a tree whose internal nodes might be numbered in * any order, with the sole requirement that node 0 is the * root; renumber the internal nodes (if necessary) to be in Easel's * convention of preorder traversal. No other aspect of is * altered (including its allocation size). * * Returns: on success. * * Throws: on allocation failure. * * Xref: STL11/77 */ int esl_tree_RenumberNodes(ESL_TREE *T) { ESL_TREE *T2 = NULL; ESL_STACK *vs = NULL; int *map = NULL; int v,new; int status; int needs_rearranging = FALSE; /* Pass 1. Preorder traverse of T by its children links; * construct map[old] -> new. */ ESL_ALLOC(map, sizeof(int) * (T->N-1)); if (( vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; }; if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; }; new = 0; while (esl_stack_IPop(vs, &v) == eslOK) { if (v != new) needs_rearranging = TRUE; map[v] = new++; if (T->right[v] > 0 && esl_stack_IPush(vs, T->right[v]) != eslOK) { status = eslEMEM; goto ERROR; }; if (T->left[v] > 0 && esl_stack_IPush(vs, T->left[v]) != eslOK) { status = eslEMEM; goto ERROR; }; } if (! needs_rearranging) { status = eslOK; goto ERROR; } /* not an error; just cleaning up & returning eslOK. */ /* Pass 2. Construct the guts of correctly numbered new T2. * (traversal order doesn't matter here) */ if (( T2 = esl_tree_Create(T->nalloc)) == NULL) { status = eslEMEM; goto ERROR; }; if (T->nodelabel != NULL) { ESL_ALLOC(T2->nodelabel, sizeof(char *) * (T2->nalloc-1)); for (v = 0; v < T2->nalloc-1; v++) T2->nodelabel[v] = NULL; } if (T->taxaparent != NULL) { ESL_ALLOC(T2->taxaparent, sizeof(int) * (T2->nalloc)); for (v = 0; v < T2->nalloc; v++) T2->taxaparent[v] = 0; } for (v = 0; v < T->N-1; v++) { T2->parent[map[v]] = map[T->parent[v]]; if (T->left[v] > 0) T2->left[map[v]] = map[T->left[v]]; /* internal nodes renumbered... */ else T2->left[map[v]] = T->left[v]; /* ...taxon indices unchanged */ if (T->right[v] > 0) T2->right[map[v]] = map[T->right[v]]; else T2->right[map[v]] = T->right[v]; T2->ld[map[v]] = T->ld[v]; T2->rd[map[v]] = T->rd[v]; if (T->taxaparent != NULL) { if (T->left[v] <= 0) T2->taxaparent[-(T->left[v])] = map[v]; if (T->right[v] <= 0) T2->taxaparent[-(T->right[v])] = map[v]; } if (T->nodelabel != NULL) esl_strdup(T->nodelabel[v], -1, &(T2->nodelabel[map[v]])); } /* Finally, swap the new guts of T2 with the old guts of T; * destroy T2 and return. T is now renumbered. */ ESL_SWAP(T->parent, T2->parent, int *); ESL_SWAP(T->left, T2->left, int *); ESL_SWAP(T->right, T2->right, int *); ESL_SWAP(T->ld, T2->ld, double *); ESL_SWAP(T->rd, T2->rd, double *); ESL_SWAP(T->taxaparent, T2->taxaparent, int *); ESL_SWAP(T->nodelabel, T2->nodelabel, char **); free(map); esl_stack_Destroy(vs); esl_tree_Destroy(T2); return eslOK; ERROR: if (map != NULL) free(map); if (vs != NULL) esl_stack_Destroy(vs); if (T2 != NULL) esl_tree_Destroy(T2); return status; } /* Function: esl_tree_VerifyUltrametric() * * Purpose: Verify that tree is ultrametric. * * Returns: if so; if not. * * Throws: on an allocation failure. */ int esl_tree_VerifyUltrametric(ESL_TREE *T) { double *d = NULL; /* Distance from root for each OTU */ int status; int i, child, parent; /* First, calculate distance from root to each taxon. * (This chunk of code might be useful to put on its own someday.) */ ESL_ALLOC(d, sizeof(double) * T->N); if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR; for (i = 0; i < T->N; i++) { d[i] = 0.0; parent = T->taxaparent[i]; if (T->left[parent] == -i) d[i] += T->ld[parent]; else if (T->right[parent] == -i) d[i] += T->rd[parent]; else ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops"); while (parent != 0) /* upwards to the root */ { child = parent; parent = T->parent[child]; if (T->left[parent] == child) d[i] += T->ld[parent]; else if (T->right[parent] == child) d[i] += T->rd[parent]; else ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops"); } } /* In an ultrametric tree, all those distances must be equal. */ for (i = 1; i < T->N; i++) if ((status = esl_DCompare(d[0], d[i], 0.0001)) != eslOK) break; free(d); return status; ERROR: if (d != NULL) free(d); return status; } /* Function: esl_tree_Validate() * * Purpose: Validates the integrity of the data structure in . * Returns if the internal data in are * consistent and valid. Returns if not, * and if a non- message buffer has been * provided by the caller, an informative message is * left in describing the reason for the * failure. * * Args: T - tree structure to validate * errbuf - NULL, or a buffer of at least p7_ERRBUFSIZE * chars to contain an error message upon * any validation failure. */ int esl_tree_Validate(ESL_TREE *T, char *errbuf) { int N; int i, c; int shouldbe; int status; if (errbuf != NULL) *errbuf = 0; N = T->N; /* just to save writing T->N so many times below */ if (N < 2) ESL_XFAIL(eslFAIL, errbuf, "number of taxa is less than 2"); if (T->parent[0] != 0) ESL_XFAIL(eslFAIL, errbuf, "parent of root 0 should be set to 0"); if (T->nalloc < N) ESL_XFAIL(eslFAIL, errbuf, "number of taxa N is less than allocation"); /* Verify preorder tree numbering. */ for (i = 0; i < N-1; i++) { if (T->left[i] > 0 && T->left[i] < i) ESL_XFAIL(eslFAIL, errbuf, "l child of node %d not in preorder", i); if (T->right[i] > 0 && T->right[i] < i) ESL_XFAIL(eslFAIL, errbuf, "r child of node %d not in preorder", i); } /* Range checks on values. */ for (i = 0; i < N-1; i++) { if (T->parent[i] < 0 || T->parent[i] > N-2) ESL_XFAIL(eslFAIL, errbuf, "parent idx of node %d invalid", i); if (T->left[i] < -(N-1) || T->left[i] > N-2) ESL_XFAIL(eslFAIL, errbuf, "left child idx of node %d invalid", i); if (T->right[i] < -(N-1) || T->right[i] > N-2) ESL_XFAIL(eslFAIL, errbuf, "right child idx of node %d invalid", i); if (T->ld[i] < 0.) ESL_XFAIL(eslFAIL, errbuf, "negative l branch length at node %d", i); if (T->rd[i] < 0.) ESL_XFAIL(eslFAIL, errbuf, "negative r branch length at node %d", i); if (T->cladesize != NULL && (T->cladesize[i] < 0 || T->cladesize[i] > N)) ESL_XFAIL(eslFAIL, errbuf, "invalid cladesize at node %d", i); } for (c = 0; c < N; c++) if (T->taxaparent != NULL && (T->taxaparent[c] < 0 || T->taxaparent[c] > N-2)) ESL_XFAIL(eslFAIL, errbuf, "invalid taxaparent at node %d", c); /* more sophisticated integrity checks on parent-child relations in nodes ...*/ for (i = 1; i < T->N-1; i++) if (T->left[T->parent[i]] != i && T->right[T->parent[i]] != i) ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at internal node %d\n", i); /* ...and between terminal nodes and taxa. */ if (T->taxaparent != NULL) for (c = 0; c < T->N; c++) if (T->left[T->taxaparent[c]] != -c && T->right[T->taxaparent[c]] != -c) ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at taxon %d\n", c); /* check on cladesizes */ if (T->cladesize != NULL) for (i = 0; i < T->N-1; i++) { shouldbe = 0; if (T->left[i] > 0) shouldbe += T->cladesize[T->left[i]]; else shouldbe++; if (T->right[i] > 0) shouldbe += T->cladesize[T->right[i]]; else shouldbe++; if (shouldbe != T->cladesize[i]) ESL_XFAIL(eslFAIL, errbuf, "incorrect cladesize at node %d", i); } return eslOK; ERROR: return status; } /* Function: esl_tree_Destroy() * * Purpose: Frees an object. */ void esl_tree_Destroy(ESL_TREE *T) { if (T == NULL) return; esl_free(T->parent); esl_free(T->left); esl_free(T->right); esl_free(T->ld); esl_free(T->rd); esl_free(T->taxaparent); esl_free(T->cladesize); esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc); esl_arr2_Destroy((void **) T->nodelabel, T->nalloc-1); free(T); return; } /*----------------- end, ESL_TREE object -----------------------*/ /***************************************************************** *# 2. Newick format i/o *****************************************************************/ /* newick_validate_unquoted(): * Returns if we can represent