/* Memory-efficient multiple sequence alignment i/o from Pfam format * * Legacy interface, now that ESL_MSAFILE is rewritten. Just enough * of the old interface is retained to support --small option in * various tools, for reading Pfam format in memory-efficient ways. * * Table of contents: * 1. The ESL_MSAFILE2 object * 2. Memory efficient read/write in Pfam format * 3. Legacy Stockholm parsing tools * 4. Unit tests * 5. Test driver * * to-do: * :: add memory-efficient interface in ESL_MSAFILE * :: add memory-efficient ESL_MSA w/ API * :: add space-efficient MSA file format */ #include "esl_config.h" #include #include #include #include #include "easel.h" #include "esl_alphabet.h" /* digital alphabet */ #include "esl_arr2.h" #include "esl_arr3.h" #include "esl_keyhash.h" /* string hashes, for mapping unique seq names */ #include "esl_msa.h" /* ESL_MSA structure */ #include "esl_msafile.h" /* preferred msafile interface, inc. fmt codes shared w/ ESL_MSAFILE2 */ #include "esl_ssi.h" /* indexing large flatfiles on disk */ #include "esl_vectorops.h" #include "esl_wuss.h" #include "esl_msafile2.h" static int msafile2_getline(ESL_MSAFILE2 *afp); static int is_blankline(char *s); static int parse_gf(ESL_MSA *msa, char *buf); static int parse_gc(ESL_MSA *msa, char *buf); static int parse_comment(ESL_MSA *msa, char *buf); /***************************************************************** *# 1. The ESL_MSAFILE2 object *****************************************************************/ static int msafile2_open(const char *filename, const char *env, ESL_MSAFILE2 **ret_afp); /* Function: esl_msafile2_Open() * Synopsis: Open an MSA file for small-memory input. * * Purpose: Open an alignment database file , which must be * in Pfam format (), and prepare for * reading information through the legacy small-memory * interface. Return the opened file pointer in . * * There are one or two special cases for . If * is "-", then the alignment is read from * . If ends in ".gz", then the file is * assumed to be compressed by gzip, and it is opened as a * pipe from . (Auto-decompression of gzip'ed files * is only available on POSIX-compliant systems w/ popen(), when * is defined at compile-time.) * * If is non-NULL, then we look for in * one or more directories in a colon-delimited list * that is the value of the environment variable . * For example, if we had * * in the environment, a profile HMM application * might pass "HMMERDB" as . * * Returns: on success, and is set to point at * an open . Caller frees this file pointer with * . * * Returns if cannot be opened. * * Throws: on allocation failure. * * Note: Implemented as a wrapper around msafile2_open(), because * esl_msafile2_OpenDigital() shares almost all the same code. */ int esl_msafile2_Open(const char *filename, const char *env, ESL_MSAFILE2 **ret_afp) { return msafile2_open(filename, env, ret_afp); } /* Function: esl_msafile2_Close() * Synopsis: Closes an open MSA file. * * Purpose: Close an open . */ void esl_msafile2_Close(ESL_MSAFILE2 *afp) { if (afp == NULL) return; #ifdef HAVE_POPEN /* gzip functionality */ if (afp->do_gzip && afp->f != NULL) pclose(afp->f); #endif if (!afp->do_gzip && ! afp->do_stdin && afp->f != NULL) fclose(afp->f); if (afp->fname) free(afp->fname); if (afp->buf) free(afp->buf); if (afp->ssi) esl_ssi_Close(afp->ssi); if (afp->msa_cache) esl_msa_Destroy(afp->msa_cache); free(afp); } /* Function: esl_msafile2_OpenDigital() * Synopsis: Open an msa file for digital input. * * Purpose: Same as , except the alignment file * will be read into a digitized internal representation, * using internal alphabet , rather than the default * internal ASCII text representation. * * The file must be in Pfam format (). * * Args: abc - pointer to internal alphabet * filename - name of alignment data file to open; * if "*.gz", attempt to read through using ; * or "-" for stdin * env - NULL, or the name of an environment variable from which * to retrieve a colon-delimited directory list to search * for in. (e.g. "HMMERDB") * ret_msafp - RETURN: open . * * Returns: on success, and is set to point at * an open . Caller frees this file pointer with * . * * if cannot be opened; * if doesn't seem to be in Pfam format. * * Throws: on allocation failure. * if format autodetection is attempted on * stdin or a gzip -dc pipe. */ int esl_msafile2_OpenDigital(const ESL_ALPHABET *abc, const char *filename, const char *env, ESL_MSAFILE2 **ret_msafp) { ESL_MSAFILE2 *msafp; int status; if ((status = msafile2_open(filename, env, &msafp)) != eslOK) return status; msafp->abc = abc; msafp->do_digital = TRUE; *ret_msafp = msafp; return eslOK; } /* msafile2_open(): * this is the routine that actually opens an ESL_MSAFILE2; * esl_msafile2_Open() and esl_msafile2_OpenDigital() are wrappers around it. */ static int msafile2_open(const char *filename, const char *env, ESL_MSAFILE2 **ret_afp) { ESL_MSAFILE2 *afp = NULL; char *ssifile = NULL; char *envfile = NULL; char *cmd = NULL; int n = strlen(filename); int status; ESL_ALLOC(afp, sizeof(ESL_MSAFILE2)); afp->f = NULL; afp->fname = NULL; afp->linenumber = 0; afp->errbuf[0] = '\0'; afp->buf = NULL; afp->buflen = 0; afp->do_gzip = FALSE; afp->do_stdin = FALSE; afp->format = eslMSAFILE_PFAM; /* legacy interface is stripped down to support ONLY --small, Pfam */ afp->do_digital = FALSE; afp->abc = NULL; afp->ssi = NULL; afp->msa_cache = NULL; if (strcmp(filename, "-") == 0) { afp->f = stdin; afp->do_stdin = TRUE; if ((status = esl_strdup("[STDIN]", -1, &(afp->fname))) != eslOK) goto ERROR; } #ifdef HAVE_POPEN /* popen(), pclose() aren't portable to non-POSIX systems; * disable this section in strict ANSI C mode. */ /* tricky: if n= length of a string s, then * s+n-i repositions pointer s at the last i chars * of the string. */ else if (n > 3 && strcmp(filename+n-3, ".gz") == 0) { /* Note that popen() will return "successfully" * if file doesn't exist, because gzip works fine * and prints an error! So we have to check for * existence of file ourself. */ if (! esl_FileExists(filename)) { status = eslENOTFOUND; goto ERROR; } ESL_ALLOC(cmd, sizeof(char) * (n+1+strlen("gzip -dc "))); sprintf(cmd, "gzip -dc %s", filename); if ((afp->f = popen(cmd, "r")) == NULL) { status = eslENOTFOUND; goto ERROR; } if ((status = esl_strdup(filename, n, &(afp->fname))) != eslOK) goto ERROR; afp->do_gzip = TRUE; } #endif /*HAVE_POPEN*/ else /* Normal file open or env file open: set ssifile */ { /* When we open a file, it may be either in the current * directory, or in the directory indicated by the env * argument - and we construct an SSI filename accordingly. */ if ((afp->f = fopen(filename, "r")) != NULL) { if ((status = esl_strdup(filename, n, &ssifile)) != eslOK) goto ERROR; if ((status = esl_strcat(&ssifile, n, ".ssi", 4)) != eslOK) goto ERROR; if ((status = esl_strdup(filename, n, &(afp->fname))) != eslOK) goto ERROR; } else if (esl_FileEnvOpen(filename, env, &(afp->f), &envfile) == eslOK) { if ((status = esl_strdup(envfile, n, &ssifile)) != eslOK) goto ERROR; if ((status = esl_strcat(&ssifile, n, ".ssi", 4)) != eslOK) goto ERROR; if ((status = esl_strdup(envfile, n, &(afp->fname))) != eslOK) goto ERROR; } else { status = eslENOTFOUND; goto ERROR;} } /* Open the SSI index file. If it doesn't exist, or * it's corrupt, or some error happens, afp->ssi stays NULL. * We should warn, probably, or provide some way for caller to * to know that we've opened the index successfully or not. */ esl_ssi_Open(ssifile, &(afp->ssi)); if (envfile != NULL) free(envfile); if (ssifile != NULL) free(ssifile); if (cmd != NULL) free(cmd); *ret_afp = afp; return eslOK; ERROR: if (envfile != NULL) free(envfile); if (ssifile != NULL) free(ssifile); if (cmd != NULL) free(cmd); if (afp != NULL) esl_msafile2_Close(afp); *ret_afp = NULL; return status; } /*--------------- end, ESL_MSAFILE2 object ----------------------*/ /*------------------ end, digital mode -------------------------*/ /****************************************************************************** * 2. Memory efficient routines for PFAM format *****************************************************************************/ static int get_pp_idx(ESL_ALPHABET *abc, char ppchar); static int gapize_string(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str); static void shrink_string(char *str, const int *useme, int len); static int determine_spacelen(char *s); /* Function: esl_msafile2_ReadInfoPfam() * Synopsis: Read Pfam formatted MSA information but not sequence data. * * Purpose: Read the next alignment from an open Stockholm Pfam * (non-interleaved, one line per seq) format alignment file * and store all non-sequence information (comments, * GF annotation and GC annotation) in a new msa object. * * This function is not as rigorous about validating the * input msa as the other read functions that store the full * alignment. Here, we only verify that there is only one * line for the first sequence read. Verifying that all * sequences are only one line would require storing and * looking up all sequence names. * * Many optional return values () allow this function * to serve the diverse needs of the miniapps that can run * in a memory-efficient mode (esl-alimerge, esl-alimask, * esl-alistat, esl-ssdraw). For any that are unwanted, pass * . * * Args: afp - open alignment file pointer * listfp - if non-NULL, dump each sequence name we read * to listfp, separated by newlines * abc - alphabet to use, only nec and used if one * of the opt_*_ct arrays is non-NULL * known_alen - known length of the alignment, -1 if unknown * must not be -1, if known_rf != NULL or * known_ss_cons != NULL. * known_rf - known RF annot. (msa->rf) for this alignment, * might be known from prev call of this func, * for example. NULL if unknown. * known_ss_cons - the known SS_cons annotation (msa->ss_cons) * for this alignment, NULL if unknown. * ret_msa - RETURN: msa with comments, GC, GF * annotation but no sequence info (nor GS, GR), * pass NULL if not wanted. * opt_nseq - optRETURN: number of sequences in msa * opt_alen - optRETURN: length of first aligned sequence * opt_ngs - optRETURN: number of GS lines in alignment * opt_maxname - optRETURN: maximum seqname length * opt_maxgf - optRETURN: maximum GF tag length * opt_maxgc - optRETURN: maximum GC tag length * opt_maxgr - optRETURN: maximum GR tag length * opt_abc_ct - optRETURN: [0..apos..alen-1][0..abc->K] * per position count of each symbol in abc over all seqs * opt_pp_ct - optRETURN: [0..apos..alen-1][0..11], * per position count of PPs over all seqs, * [11] is gaps, [10] is '*', [0-9] are '0'-'9' * opt_bp_ct - optRETURN: [0..apos..alen-1][0..abc->K-1][0..abc->K-1] * per position count of each possible basepair * in alignment, for pair apos:apos2, where * apos < apos2 and apos:apos2 form a basepair * in . If non-NULL, * must be non-NULL. * opt_spos_ct - optRETURN: [0..apos..alen-1] per position count * of first nongap residue in each sequence, * ex: opt_spos_ct[100] = x means x seqs have their * first nongap residue at position 100 * opt_epos_ct - optRETURN: [0..apos..alen-1] same as opt_spos_ct, * except for final position instead of first * * Returns: on success. Returns if there are no more * alignments in , and is set to NULL and * are set to 0. * if parse fails because of a file format * problem, in which case errbuf> is set to contain a * formatted message that indicates the cause of the * problem, is set to and are set * to 0. * * Throws: on allocation error. * * Xref: ~nawrockie/notebook/9_1206_esl_msa_mem_efficient/ */ int esl_msafile2_ReadInfoPfam(ESL_MSAFILE2 *afp, FILE *listfp, ESL_ALPHABET *abc, int64_t known_alen, char *known_rf, char *known_ss_cons, ESL_MSA **ret_msa, int *opt_nseq, int64_t *opt_alen, int *opt_ngs, int *opt_maxname, int *opt_maxgf, int *opt_maxgc, int *opt_maxgr, double ***opt_abc_ct, double ***opt_pp_ct, double ****opt_bp_ct, int **opt_spos_ct, int **opt_epos_ct) { char *s; /* pointer to current character in afp */ int status; /* easel status code */ int status2; /* another easel status code */ ESL_MSA *msa = NULL; /* the msa we're creating */ int nseq = 0; /* number of sequences read */ int64_t alen = -1; /* length of the alignment */ int ngs = 0; /* number of GS lines read */ int maxname = 0; /* max length seq name */ int maxgf = 0; /* max length GF tag */ int maxgc = 0; /* max length GC tag */ int maxgr = 0; /* max length GR tag */ char *seqname; /* ptr to a sequence name */ int namelen; /* length of a sequence name */ char *first_seqname = NULL; /* name of first sequence read */ char *gf, *gc, *gr; /* for storing '#=GF', '#=GC', '#=GR', temporarily */ char *tag; /* a GC or GR tag */ int taglen; /* length of a tag */ char *text; /* text string */ int textlen; /* length of text string */ int i, x; /* counters */ int j; /* position for a right half of a bp */ int apos; /* counter over alignment positions */ double **abc_ct = NULL; /* [0..alen-1][0..abc->K] per position count of each residue in abc and gaps over all seqs */ double ***bp_ct = NULL; /* [0..alen-1][0..abc->Kp][0..abc->Kp], count of each possible base pair at each position, over all sequences, missing and nonresidues are *not counted* base pairs are indexed by 'i' for a base pair between positions i and j, where i < j. */ int nppvals = 12; /* '0'-'9' = 0-9, '*' = 10, gap = '11' */ double **pp_ct = NULL; /* [0..alen-1][0..nppvals-1] per position count of each possible PP char over all seqs */ int ppidx; /* index for 2nd dim of pp_ct array */ int *spos_ct = NULL; /* [0..alen-1] number of seqs that start (have first nongap residue) at each position */ int *epos_ct = NULL; /* [0..alen-1] number of seqs that end (have final nongap residue) at each position */ ESL_DSQ *tmp_dsq = NULL; /* temporary digitized sequence, only used if opt_abc_ct != NULL */ int *a2rf_map = NULL; /* [0..apos..known_alen-1] = rfpos, nongap RF position apos maps to, * -1 if apos is not a nongap RF position */ int *ct = NULL; /* 0..known_alen-1 base pair partners array for known_ss_cons */ char *ss_nopseudo = NULL; /* no-pseudoknot version of known_ss_cons */ if(afp->format != eslMSAFILE_PFAM) ESL_EXCEPTION(eslEINCONCEIVABLE, "only non-interleaved (1 line /seq, Pfam) Stockholm formatted files can be read in small memory mode"); if(opt_abc_ct != NULL && abc == NULL) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, abc == NULL, opt_abc_ct != NULL"); if(opt_pp_ct != NULL && abc == NULL) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, abc == NULL, opt_pp_ct != NULL"); if(opt_bp_ct != NULL && abc == NULL) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, abc == NULL, opt_bp_ct != NULL"); if(opt_spos_ct != NULL && abc == NULL) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, abc == NULL, opt_spos_ct != NULL"); if(opt_epos_ct != NULL && abc == NULL) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, abc == NULL, opt_epos_ct != NULL"); if(opt_spos_ct != NULL && known_alen == -1) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, opt_spos_ct != NULL, known_alen == -1"); if(opt_epos_ct != NULL && known_alen == -1) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, opt_epos_ct != NULL, known_alen == -1"); if(opt_bp_ct != NULL && known_ss_cons == NULL) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, known_ss_cons == NULL, opt_bp_ct != NULL"); if(known_rf != NULL && known_alen == -1) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, known_rf != NULL, known_alen == -1"); if(known_ss_cons != NULL && known_alen == -1) ESL_FAIL(eslEINVAL, afp->errbuf, "contract violation, known_ss_cons != NULL, known_alen == -1"); if (feof(afp->f)) { status = eslEOF; goto ERROR; } afp->errbuf[0] = '\0'; /* Preliminaries */ /* allocate and initialize spos_ct and epos_ct, if we'll return them */ if(opt_spos_ct != NULL || opt_epos_ct != NULL) { ESL_ALLOC(spos_ct, sizeof(int) * known_alen); ESL_ALLOC(epos_ct, sizeof(int) * known_alen); esl_vec_ISet(spos_ct, known_alen, 0); esl_vec_ISet(epos_ct, known_alen, 0); } /* if bp_ct != NULL, determine the ct array from the known_ss_cons, and allocate the bp_ct */ if(opt_bp_ct != NULL) { /* contract enforces that if this is true, known_ss_cons != NULL and known_alen != -1 */ /* get ct array which defines the consensus base pairs */ ESL_ALLOC(ct, sizeof(int) * (known_alen+1)); ESL_ALLOC(ss_nopseudo, sizeof(char) * (known_alen+1)); esl_wuss_nopseudo(known_ss_cons, ss_nopseudo); if ((status = esl_wuss2ct(ss_nopseudo, known_alen, ct)) != eslOK) ESL_FAIL(status, afp->errbuf, "consensus structure string is inconsistent."); ESL_ALLOC(bp_ct, sizeof(double **) * known_alen); for(apos = 0; apos < known_alen; apos++) { /* careful ct is indexed 1..alen, not 0..alen-1 */ if(ct[(apos+1)] > (apos+1)) { /* apos+1 is an 'i' in an i:j pair, where i < j */ ESL_ALLOC(bp_ct[apos], sizeof(double *) * (abc->Kp)); for(x = 0; x < abc->Kp; x++) { ESL_ALLOC(bp_ct[apos][x], sizeof(double) * (abc->Kp)); esl_vec_DSet(bp_ct[apos][x], abc->Kp, 0.); } } else { /* apos+1 is not an 'i' in an i:j pair, where i < j, set to NULL */ bp_ct[apos] = NULL; } } } /* end of preliminaries */ /* Initialize allocation of the MSA: * We won't store any sequence information, so initial blocksize is * 0 seqs of 0 length. */ if (afp->do_digital == TRUE && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; } if (afp->do_digital == FALSE && (msa = esl_msa_Create(16, -1)) == NULL) { status = eslEMEM; goto ERROR; } if (msa == NULL) { status = eslEMEM; goto ERROR; } /* Check the magic Stockholm header line. * We have to skip blank lines here, else we perceive * trailing blank lines in a file as a format error when * reading in multi-record mode. */ do { if ((status = msafile2_getline(afp)) != eslOK) goto ERROR; /* includes EOF */ } while (is_blankline(afp->buf)); if (strncmp(afp->buf, "# STOCKHOLM 1.", 14) != 0) ESL_XFAIL(eslEFORMAT, afp->errbuf, "parse failed (line %d): missing \"# STOCKHOLM\" header", afp->linenumber); /* Read the alignment file one line at a time. */ while ((status2 = msafile2_getline(afp)) == eslOK) { s = afp->buf; while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */ if (*s == '#') { if(strncmp(s, "#=GF", 4) == 0) { if (ret_msa != NULL) { if ((status = parse_gf(msa, s)) != eslOK) ESL_XFAIL(status, afp->errbuf, "parse failed (line %d): bad #=GF line", afp->linenumber); } else if (opt_maxgf != NULL) { /* we need to parse out GF tag len to see if it is > maxgf */ s = afp->buf; if (esl_strtok (&s, " \t\n\r", &gf) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GF line", afp->linenumber); if (esl_strtok_adv(&s, " \t\n\r", &tag, &taglen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GF line", afp->linenumber); maxgf = ESL_MAX(maxgf, taglen); } } else if (strncmp(s, "#=GC", 4) == 0) { if (ret_msa != NULL) { if ((status = parse_gc(msa, s)) != eslOK) ESL_XFAIL(status, afp->errbuf, "parse failed (line %d): bad #=GC line", afp->linenumber); } else if (opt_maxgc != NULL) { /* we need to parse out GC tag len to see if it is > maxgc */ s = afp->buf; if (esl_strtok (&s, " \t\n\r", &gc) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GC line", afp->linenumber); if (esl_strtok_adv(&s, " \t\n\r", &tag, &taglen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GC line", afp->linenumber); maxgc = ESL_MAX(maxgc, taglen); } } else if (strncmp(s, "#=GS", 4) == 0) { ngs++; } else if (strncmp(s, "#=GR", 4) == 0) { if(opt_maxgr != NULL || opt_pp_ct != NULL) { s = afp->buf; if (esl_strtok (&s, " \t\n\r", &gr) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line", afp->linenumber); if (esl_strtok_adv(&s, " \t\n\r", &seqname, &namelen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line", afp->linenumber); if (esl_strtok_adv(&s, " \t\n\r", &tag, &taglen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line", afp->linenumber); maxgr = ESL_MAX(maxgr, taglen); if(opt_pp_ct != NULL) { if (strncmp(tag, "PP", 2) == 0) { if (esl_strtok_adv(&s, " \t\n\r", &text, &textlen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR PP line", afp->linenumber); /* verify, or set alignment length */ if(alen == -1) { /* first aligned text line, need to allocate pp_ct, and possibly abc_ct, spos_ct, epos_ct */ alen = textlen; if(known_alen != -1 && known_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): known alen (%" PRId64 " passed in) != actual alen (%d)", afp->linenumber, known_alen, textlen); ESL_ALLOC(pp_ct, sizeof(double *) * alen); for(apos = 0; apos < alen; apos++) { ESL_ALLOC(pp_ct[apos], sizeof(double) * nppvals); esl_vec_DSet(pp_ct[apos], nppvals, 0.); } if(opt_abc_ct != NULL || opt_bp_ct != NULL) { ESL_ALLOC(tmp_dsq, (alen+2) * sizeof(ESL_DSQ)); } if(opt_abc_ct != NULL) { ESL_ALLOC(abc_ct, sizeof(double *) * alen); for(apos = 0; apos < alen; apos++) { ESL_ALLOC(abc_ct[apos], sizeof(double) * (abc->K+1)); esl_vec_DSet(abc_ct[apos], (abc->K+1), 0.); } } } else if(alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR PP line, len %d, expected %" PRId64, afp->linenumber, textlen, alen); for(apos = 0; apos < alen; apos++) { /* update appropriate PP count */ if((ppidx = get_pp_idx(abc, text[apos])) == -1) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR PP char: %c", afp->linenumber, text[apos]); pp_ct[apos][ppidx] += 1.; } } } } } else if (ret_msa != NULL && ((status = parse_comment(msa, s)) != eslOK)) { ESL_XFAIL(status, afp->errbuf, "parse failed (line %d): bad comment line", afp->linenumber); } } /* end of 'if (*s == '#')' */ else if (strncmp(s, "//", 2) == 0) break; /* normal way out */ else if (*s == '\n' || *s == '\r') continue; else { /* sequence line */ if(listfp != NULL || opt_maxname != NULL || opt_alen != NULL || opt_abc_ct != NULL || opt_spos_ct != NULL || opt_epos_ct != NULL) { /* we need to parse out the seqname */ s = afp->buf; if (esl_strtok_adv(&s, " \t\n\r", &seqname, &namelen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad sequence line", afp->linenumber); if (listfp != NULL) fprintf(listfp, "%s\n", seqname); maxname = ESL_MAX(maxname, namelen); if (opt_alen != NULL || opt_abc_ct != NULL || opt_spos_ct != NULL || opt_epos_ct != NULL) { /* we need to parse out the seq */ if (esl_strtok_adv(&s, " \t\n\r", &text, &textlen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad sequence line", afp->linenumber); /* if first aligned seq read, store it's name, else see if it is an additional line of first aseq */ if(nseq == 0) { if ((status = esl_strdup(seqname, -1, &(first_seqname))) != eslOK) goto ERROR; } else if(strcmp(first_seqname, seqname) == 0) { ESL_XFAIL(eslEFORMAT, afp->errbuf, "parse failed (line %d): two seqs with same name. Alignment may be in interleaved Stockholm. Reformat to Pfam with esl-reformat.", afp->linenumber); } if(alen == -1) { /* first aligned text line, need to allocate pp_ct, and possibly abc_ct, spos_ct, epos_ct */ alen = textlen; if(known_alen != -1 && known_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): known alen (%" PRId64 " passed in) != actual alen (%d)", afp->linenumber, known_alen, textlen); if(opt_abc_ct != NULL || opt_bp_ct != NULL) { ESL_ALLOC(tmp_dsq, (alen+2) * sizeof(ESL_DSQ)); } if(opt_abc_ct != NULL) { ESL_ALLOC(abc_ct, sizeof(double *) * alen); for(apos = 0; apos < alen; apos++) { ESL_ALLOC(abc_ct[apos], sizeof(double) * (abc->K+1)); esl_vec_DSet(abc_ct[apos], (abc->K+1), 0.); } } if(opt_pp_ct != NULL) { ESL_ALLOC(pp_ct, sizeof(double *) * alen); for(apos = 0; apos < alen; apos++) { ESL_ALLOC(pp_ct[apos], sizeof(double) * nppvals); esl_vec_DSet(pp_ct[apos], nppvals, 0.); } } } else if(alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad aligned seq line, len %d, expected %" PRId64, afp->linenumber, textlen, alen); if(opt_abc_ct != NULL || opt_bp_ct != NULL) { /* update appropriate abc and/or bp count. first, digitize the text */ if((status = esl_abc_Digitize(abc, text, tmp_dsq)) != eslOK) ESL_XFAIL(status, afp->errbuf, "small mem parse failed (line %d): problem digitizing sequence", afp->linenumber); } if(opt_abc_ct != NULL) { for(apos = 0; apos < alen; apos++) { /* update appropriate abc count, careful, tmp_dsq ranges from 1..alen (not 0..alen-1) */ if((status = esl_abc_DCount(abc, abc_ct[apos], tmp_dsq[apos+1], 1.0)) != eslOK) ESL_XFAIL(status, afp->errbuf, "small mem parse failed (line %d): problem counting residue %d", afp->linenumber, apos+1); } } if(opt_bp_ct != NULL) { for(apos = 0; apos < alen; apos++) { /* update appropriate abc count, careful, tmp_dsq ranges from 1..alen (not 0..alen-1) */ if(bp_ct[apos] != NULL) { /* our flag for whether position (apos+1) is an 'i' in an i:j pair where i < j */ j = ct[apos+1] - 1; /* ct is indexed 1..alen */ bp_ct[apos][tmp_dsq[(apos+1)]][tmp_dsq[(j+1)]]++; } } } if(opt_spos_ct != NULL) { for(apos = 0; apos < alen; apos++) { /* find first non-gap position */ if(! esl_abc_XIsGap(abc, tmp_dsq[apos+1])) { spos_ct[apos]++; break; } } } if(opt_epos_ct != NULL) { /* find final non-gap position */ for(apos = alen-1; apos >= 0; apos--) { if(! esl_abc_XIsGap(abc, tmp_dsq[apos+1])) { epos_ct[apos]++; break; } } } } } nseq++; } } /* If we saw a normal // end, we would've successfully read a line, * so when we get here, status (from the line read) should be eslOK. */ if (status2 != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "parse failed (line %d): didn't find // at end of alignment", afp->linenumber); /* if we're returning maxgc and an msa, determine maxgc, which we didn't do above b/c we parsed GC lines with parse_gc() * If msa != NULL, we already know maxgc */ if(ret_msa != NULL && opt_maxgc != NULL) { for(i = 0; i < msa->ngc; i++) maxgc = ESL_MAX(maxgc, strlen(msa->gc_tag[i])); if (msa->rf != NULL) maxgc = ESL_MAX(maxgc, 2); /* 2 == strlen("RF") */ if (msa->mm != NULL) maxgc = ESL_MAX(maxgc, 2); /* 2 == strlen("MM") */ if (msa->ss_cons != NULL) maxgc = ESL_MAX(maxgc, 7); /* 7 == strlen("SS_cons") */ if (msa->sa_cons != NULL) maxgc = ESL_MAX(maxgc, 7); /* 7 == strlen("SA_cons") */ if (msa->pp_cons != NULL) maxgc = ESL_MAX(maxgc, 7); /* 7 == strlen("PP_cons") */ } /* same as for maxgc, but now for maxgf */ if(ret_msa != NULL && opt_maxgf != NULL) { for(i = 0; i < msa->ngf; i++) maxgf = ESL_MAX(maxgf, strlen(msa->gf_tag[i])); if (msa->name != NULL) maxgf = ESL_MAX(maxgf, 2); /* 2 == strlen("ID") */ if (msa->desc != NULL) maxgf = ESL_MAX(maxgf, 2); /* 2 == strlen("DE") */ if (msa->acc != NULL) maxgf = ESL_MAX(maxgf, 2); /* 2 == strlen("AC") */ if (msa->au != NULL) maxgf = ESL_MAX(maxgf, 2); /* 2 == strlen("AU") */ } if (first_seqname) free(first_seqname); if (tmp_dsq) free(tmp_dsq); if (ct) free(ct); if (ss_nopseudo) free(ss_nopseudo); if (a2rf_map) free(a2rf_map); if (ret_msa) *ret_msa = msa; else esl_msa_Destroy(msa); if (opt_nseq) *opt_nseq = nseq; if (opt_alen) *opt_alen = alen; if (opt_ngs) *opt_ngs = ngs; if (opt_maxname) *opt_maxname = maxname; if (opt_maxgf) *opt_maxgf = maxgf; if (opt_maxgc) *opt_maxgc = maxgc; if (opt_maxgr) *opt_maxgr = maxgr; if (opt_abc_ct) *opt_abc_ct = abc_ct; else esl_arr2_Destroy((void **) abc_ct, alen); if (opt_pp_ct) *opt_pp_ct = pp_ct; else esl_arr2_Destroy((void **) pp_ct, alen); if (opt_bp_ct) *opt_bp_ct = bp_ct; else esl_arr3_Destroy((void ***) bp_ct, known_alen, abc ? abc->Kp : 0); if (opt_spos_ct) *opt_spos_ct = spos_ct; else esl_free(spos_ct); if (opt_epos_ct) *opt_epos_ct = epos_ct; else esl_free(epos_ct); return eslOK; ERROR: esl_free(first_seqname); esl_free(tmp_dsq); esl_free(ct); esl_free(ss_nopseudo); esl_free(a2rf_map); esl_msa_Destroy(msa); esl_arr2_Destroy((void **) pp_ct, alen); esl_arr2_Destroy((void **) abc_ct, alen); esl_arr3_Destroy((void ***) bp_ct, known_alen, abc ? abc->Kp : 0); esl_free(spos_ct); esl_free(epos_ct); if (ret_msa) *ret_msa = NULL; if (opt_nseq) *opt_nseq = 0; if (opt_alen) *opt_alen = 0; if (opt_ngs) *opt_ngs = 0; if (opt_maxname) *opt_maxname = 0; if (opt_maxgf) *opt_maxgf = 0; if (opt_maxgc) *opt_maxgc = 0; if (opt_maxgr) *opt_maxgr = 0; if (opt_pp_ct) *opt_pp_ct = NULL; if (opt_abc_ct) *opt_abc_ct = NULL; if (opt_bp_ct) *opt_bp_ct = NULL; if (opt_spos_ct) *opt_spos_ct = NULL; if (opt_epos_ct) *opt_epos_ct = NULL; return status; } /* Function: esl_msafile2_RegurgitatePfam() * Synopsis: Read and write next Pfam formatted MSA without storing it. * * Purpose: Read and immediately write each line of a MSA after * optionally modifying aligned data by either adding all * gap columns or removing some columns. Do this without * creating an msa object, so memory usage is minimized. * * The alignment file must be in Pfam format (1 * line/seq non-interleaved Stockholm). The arguments * specify which parts of the alignment to write. * specifies which positions to keep in aligned strings, if * NULL then all are kept. specifies how many gap * characters to add after each aligned position, if NULL * then none are added. Only one of and * can be non-NULL. * * If one of the or keyhashes are * non-NULL, they specify names of sequences (and affiliated * annotation) to output () or not output * (). Only one of these may be non-NULL. * * , , and specify the max * length sequence name, GF tag, GC tag, and GR tag, and can * be provided by a caller that knows their values, e.g. as * revealed by a previous call to . * If any are -1, the caller didn't know the value, and the * spacing in the alignment file we read in will be * preserved. An example of useful non -1 values is if we're * merging multiple alignments into a single alignment, and * the spacing of any given alignment should change when all * alignments are considered (like what the esl-alimerge * miniapp does). * * This function is not as rigorous about validating the * input as the other read functions that store the full * alignment. Here, we verify that there is only one line * for the first sequence read (verifying that all sequences * are only one line would require storing and looking up * all sequence names), that each aligned data line (, * GC, GR) are all the same length. The aligned length must * equal if it is not passed in as -1 (indicating * caller doesn't know alignment length). If or * is non-NULL, must not be -1. * * Aligned sequence residues are not checked to make sure * they are consistent with an alphabet, they are simply * rewritten as they are read from the input file. * * Args: afp - open alignment file pointer * ofp - output file pointer * maxname - maximum length of a sequence name (-1 if unknown) * maxgf - maximum length of a GF tag (-1 if unknown) * maxgc - maximum length of a GC tag (-1 if unknown) * maxgr - maximum length of a GR tag (-1 if unknown) * do_header - TRUE to write magic Stockholm header at top to ofp * do_trailer - TRUE to write '//' at end to ofp * do_blanks - TRUE to regurgitate blank lines, FALSE not to * do_comments - TRUE to write comments to ofp * do_gf - TRUE to write GF annotation to ofp * do_gs - TRUE to write GS annotation to ofp * do_gc - TRUE to write GC annotation to ofp * do_gr - TRUE to write GR annotation to ofp * do_aseq - TRUE to write aligned sequences to ofp * seqs2regurg - keyhash of names of the sequences to write, all others * will not be written. Associated annotation (GS, GR) * will be written for these sequences only. Must be NULL * if seqs2skip is non-NULL (enforced by contract). * If both are NULL all seqs are written. * seqs2skip - keyhash of names of the sequences to skip (not write), * all others will be written. Associated annotation (GS, GR) * will not be written for these sequences. Must be NULL * if seqs2regurg is NULL (enforced by contract). * If both are NULL all seqs are written. * useme - [0..apos..exp_alen-1] TRUE to include position apos in output of * aligned data (GC,GR,aseq), FALSE to remove it, can be NULL * add2me - [0..apos..exp_alen-1] number of all gaps to add after each * position of aligned data (GC,GR,aseq), can be NULL * exp_alen - expected alignment length, -1 if unknown, which * is okay as long as useme == add2me == NULL * gapchar2add - gap character, only relevant if add2me != NULL * opt_nseq_read - RETURN: optional, number of aligned sequences read * opt_nseq_regurged - RETURN: optional, number of aligned sequences regurgitated, * same as opt_nseq_read unless seqs2regurg != NULL or * seqs2skip != NULL. * * Returns: on success. * Returns if there are no more alignments in . * if parse fails because of a file format problem, * in which case errbuf> is set to contain a formatted message * that indicates the cause of the problem. * * Throws: on allocation error. * * Xref: ~nawrockie/notebook/9_1206_esl_msa_mem_efficient/ */ int esl_msafile2_RegurgitatePfam(ESL_MSAFILE2 *afp, FILE *ofp, int maxname, int maxgf, int maxgc, int maxgr, int do_header, int do_trailer, int do_blanks, int do_comments, int do_gf, int do_gs, int do_gc, int do_gr, int do_aseq, ESL_KEYHASH *seqs2regurg, ESL_KEYHASH *seqs2skip, int *useme, int *add2me, int exp_alen, char gapchar2add, int *opt_nseq_read, int *opt_nseq_regurged) { char *s = NULL; int status; int status2; char *seqname = NULL; char *first_seqname = NULL; char *text = NULL; char *gapped_text = NULL; char *tag = NULL; char *gf = NULL; char *gc = NULL; char *gs = NULL; char *gr = NULL; int curmargin, curmargin2, namelen, spacelen, spacelen2, textlen, taglen; int gaps2addlen; int margin; /* width of left hand side margin */ int flushpoint = 10000; /* number of lines read at which to flush ofp */ int nseq_read = 0; int nseq_regurged = 0; /* contract check */ if ( ofp == NULL) ESL_EXCEPTION(eslEINCONCEIVABLE, "ofp is NULL"); if ( afp->format != eslMSAFILE_PFAM) ESL_EXCEPTION(eslEINCONCEIVABLE, "only Pfam (1 line /seq) Stockholm formatted files allowed in small mem mode"); if(add2me != NULL && useme != NULL) ESL_EXCEPTION(eslEINCONCEIVABLE, "add2me and useme both non-NULL"); if((add2me != NULL || useme != NULL) && exp_alen == -1) ESL_EXCEPTION(eslEINCONCEIVABLE, "exp_alen == -1, but add2me or useme non-NULL"); if(seqs2regurg != NULL && seqs2skip != NULL) ESL_EXCEPTION(eslEINVAL, "seqs2regurg and seqs2skip both non-NULL, only one may be"); gaps2addlen = (add2me == NULL) ? 0 : esl_vec_ISum(add2me, (exp_alen+1)); margin = -1; if (maxgf != -1 && maxgf < 2) maxgf = 2; if (maxname != -1) margin = maxname+1; if (maxgc > 0 && maxgc+6 > margin) margin = maxgc+6; if (maxgr > 0 && maxname+maxgr+7 > margin) margin = maxname+maxgr+7; /* if margin is still -1, we'll use the same spacing we read in from the file */ if (feof(afp->f)) { status = eslEOF; goto ERROR; } afp->errbuf[0] = '\0'; /* Check the magic Stockholm header line. * We have to skip blank lines here, else we perceive * trailing blank lines in a file as a format error when * reading in multi-record mode. */ do { if ((status = msafile2_getline(afp)) != eslOK) goto ERROR; /* includes EOF */ } while (is_blankline(afp->buf)); if (strncmp(afp->buf, "# STOCKHOLM 1.", 14) != 0) ESL_XFAIL(eslEFORMAT, afp->errbuf, "parse failed (line %d): missing \"# STOCKHOLM\" header", afp->linenumber); if(do_header) fprintf(ofp, "%s", afp->buf); /* Read the alignment file one line at a time. */ while ((status2 = msafile2_getline(afp)) == eslOK) { if(afp->linenumber % flushpoint == 0) fflush(ofp); s = afp->buf; while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */ if (*s == '#') { if (strncmp(s, "#=GF", 4) == 0) { if (do_gf) { if(maxgf == -1) { /* just print line as is */ fprintf(ofp, "%s", afp->buf); } else { /* parse line into temporary strings, then print it out with correct formatting */ s = afp->buf; if (esl_strtok(&s, " \t\n\r", &gf) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GF line", afp->linenumber); if (esl_strtok(&s, " \t\n\r", &tag) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GF line", afp->linenumber); if (esl_strtok(&s, "\n\r", &text) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GF line", afp->linenumber); fprintf(ofp, "#=GF %-*s %s\n", maxgf, tag, text); } } } else if (strncmp(s, "#=GC", 4) == 0) { if(do_gc) { /* parse line into temporary strings */ s = afp->buf; if (esl_strtok (&s, " \t\n\r", &gc) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GC line", afp->linenumber); if (esl_strtok_adv(&s, " \t\n\r", &tag, &taglen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GC line", afp->linenumber); spacelen = determine_spacelen(s); if (esl_strtok_adv(&s, " \t\n\r", &text, &textlen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GC line", afp->linenumber); /* verify alignment length */ if(exp_alen == -1) exp_alen = textlen; else if(exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GC line, len %d, expected %d", afp->linenumber, textlen, exp_alen); /* determine margin length of sequence name field for output formatting */ curmargin = (margin == -1) ? taglen + spacelen : margin - 6; /* output, after optionally removing some characters (if useme != NULL) or adding gaps (if add2me != NULL) (contract enforces only one can be non-null) */ if(useme != NULL) { /* if this is a GC SS_cons line, remove broken basepairs first - only if it's in WUSS RNA format (NOT for a protein SS!) */ if (strncmp(tag, "SS_cons", 7) == 0 && afp->abc && (afp->abc->type == eslRNA || afp->abc->type == eslDNA)) { if((status = esl_msa_RemoveBrokenBasepairsFromSS(text, afp->errbuf, textlen, useme)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GC SS_cons line", afp->linenumber); } shrink_string(text, useme, exp_alen); /* this is done in place on text */ } if(add2me != NULL) { if((status = gapize_string(text, textlen, textlen + gaps2addlen, add2me, gapchar2add, &gapped_text)) != eslOK) goto ERROR; fprintf(ofp, "#=GC %-*s %s\n", curmargin, tag, gapped_text); free(gapped_text); } else { fprintf(ofp, "#=GC %-*s %s\n", curmargin, tag, text); } } } else if (strncmp(s, "#=GS", 4) == 0) { /* we don't validate the sequence exists, this would require storing all seqnames */ if (do_gs) { if(maxname == -1 && seqs2regurg == NULL) { /* just print line as is */ fprintf(ofp, "%s", afp->buf); } else { /* parse line into temporary strings, then print it out with correct formatting */ if((seqs2regurg == NULL && seqs2skip == NULL) || (seqs2regurg != NULL && (status = esl_keyhash_Lookup(seqs2regurg, seqname, -1, NULL)) == eslOK) || (seqs2skip != NULL && (status = esl_keyhash_Lookup(seqs2skip, seqname, -1, NULL)) == eslENOTFOUND)) { /* this if() will evaluate as TRUE if seqs2regurg and seqs2skip are both NULL, or the seqname exists in seqs2regurg or does not exist in seqs2skip, else it will return FALSE */ s = afp->buf; if (esl_strtok(&s, " \t\n\r", &gs) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GS line", afp->linenumber); if (esl_strtok(&s, " \t\n\r", &seqname) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GS line", afp->linenumber); if (esl_strtok(&s, " \t\n\r", &tag) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GS line", afp->linenumber); if (esl_strtok(&s, "\n\r", &text) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GS line", afp->linenumber); fprintf(ofp, "#=GS %-*s %s %s\n", maxname, seqname, tag, text); } } } } else if (strncmp(s, "#=GR", 4) == 0) { if(do_gr) { /* parse line into temporary strings */ s = afp->buf; if (esl_strtok (&s, " \t\n\r", &gr) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line", afp->linenumber); if (esl_strtok_adv(&s, " \t\n\r", &seqname, &namelen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line", afp->linenumber); spacelen = determine_spacelen(s); if (esl_strtok_adv(&s, " \t\n\r", &tag, &taglen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line", afp->linenumber); spacelen2 = determine_spacelen(s); if (esl_strtok_adv(&s, " \t\n\r", &text, &textlen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line", afp->linenumber); /* verify alignment length */ if(exp_alen == -1) exp_alen = textlen; else if(exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR line, len %d, expected %d", afp->linenumber, textlen, exp_alen); /* determine length of fields for output formatting */ curmargin = (maxname == -1) ? namelen + spacelen : maxname; curmargin2 = (maxname == -1) ? taglen + spacelen2 : margin - maxname - 7; /* determine if we should regurgitate GR for this sequence or not */ if((seqs2regurg == NULL && seqs2skip == NULL) || (seqs2regurg != NULL && (status = esl_keyhash_Lookup(seqs2regurg, seqname, -1, NULL)) == eslOK) || (seqs2skip != NULL && (status = esl_keyhash_Lookup(seqs2skip, seqname, -1, NULL)) == eslENOTFOUND)) { /* this if() will evaluate as TRUE if seqs2regurg and seqs2skip are both NULL, or the seqname exists in seqs2regurg or does not exist in seqs2skip, else it will return FALSE */ /* output GR, after optionally removing some characters (if useme != NULL) or adding gaps (if add2me != NULL) (contract enforces only one can be non-null) */ if(useme != NULL) { /* if this is a GR SS line, remove broken basepairs first */ if( strncmp(tag, "SS", 2) == 0 && afp->abc && (afp->abc->type == eslRNA || afp->abc->type == eslDNA)) { if((status = esl_msa_RemoveBrokenBasepairsFromSS(text, afp->errbuf, textlen, useme)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad #=GR SS line", afp->linenumber); } shrink_string(text, useme, exp_alen); /* this is done in place on text */ } if(add2me != NULL) { if((status = gapize_string(text, textlen, textlen + gaps2addlen, add2me, gapchar2add, &gapped_text)) != eslOK) goto ERROR; fprintf(ofp, "#=GR %-*s %-*s %s\n", curmargin, seqname, curmargin2, tag, gapped_text); free(gapped_text); } else { fprintf(ofp, "#=GR %-*s %-*s %s\n", curmargin, seqname, curmargin2, tag, text); } } } } else if (do_comments) fprintf(ofp, "%s", afp->buf); /* print comment line, if desired */ } /* end of 'if (*s == '#')' */ else if (strncmp(s, "//", 2) == 0) { if(do_trailer) fprintf(ofp, "%s", afp->buf); break; /* normal way out */ } else if (*s == '\n' || *s == '\r') { if(do_blanks) { fprintf(ofp, "%s", afp->buf); } continue; } else { /* sequence line */ if(do_aseq) { /* parse line into temporary strings */ s = afp->buf; if (esl_strtok_adv(&s, " \t\n\r", &seqname, &namelen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad sequence line", afp->linenumber); spacelen = determine_spacelen(s); if (esl_strtok_adv(&s, " \t\n\r", &text, &textlen, NULL) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad sequence line", afp->linenumber); /* verify alignment length */ if((exp_alen != -1) && (exp_alen != textlen)) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): bad seq line, len %d, expected %d", afp->linenumber, textlen, exp_alen); /* determine length of sequence name field for output formatting */ curmargin = (margin == -1) ? namelen + spacelen : margin-1; /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */ if(nseq_read == 0) { if ((status = esl_strdup(seqname, -1, &(first_seqname))) != eslOK) goto ERROR; } else if(strcmp(first_seqname, seqname) == 0) { ESL_XFAIL(eslEFORMAT, afp->errbuf, "parse failed (line %d): two seqs named %s. Alignment appears to be in Stockholm format. Reformat to Pfam with esl-reformat.", afp->linenumber, seqname); } nseq_read++; /* determine if we should regurgitate this sequence or not */ if((seqs2regurg == NULL && seqs2skip == NULL) || (seqs2regurg != NULL && (status = esl_keyhash_Lookup(seqs2regurg, seqname, -1, NULL)) == eslOK) || (seqs2skip != NULL && (status = esl_keyhash_Lookup(seqs2skip, seqname, -1, NULL)) == eslENOTFOUND)) { /* this if() will evaluate as TRUE if seqs2regurg and seqs2skip are both NULL, or the seqname exists in seqs2regurg or does not exist in seqs2skip, else it will return FALSE */ /* output sequence, after optionally removing some characters (if useme != NULL) or adding gaps (if add2me != NULL) (contract enforces only one can be non-null) */ nseq_regurged++; if(useme != NULL) shrink_string(text, useme, exp_alen); /* this is done in place on text */ if(add2me != NULL) { if((status = gapize_string(text, textlen, textlen + gaps2addlen, add2me, gapchar2add, &gapped_text)) != eslOK) goto ERROR; fprintf(ofp, "%-*s %s\n", curmargin, seqname, gapped_text); free(gapped_text); } else { fprintf(ofp, "%-*s %s\n", curmargin, seqname, text); } } } } } /* If we saw a normal // end, we would've successfully read a line, * so when we get here, status (from the line read) should be eslOK. */ if (status2 != eslOK) ESL_XFAIL(eslEFORMAT, afp->errbuf, "small mem parse failed (line %d): didn't find // at end of alignment", afp->linenumber); if (first_seqname != NULL) free(first_seqname); if (opt_nseq_read != NULL) *opt_nseq_read = nseq_read; if (opt_nseq_regurged != NULL) *opt_nseq_regurged = nseq_regurged; return eslOK; ERROR: return status; } /* get_pp_idx * * Given a GR PP or GC PP_cons character, return the appropriate index * in a pp_ct[] vector. * '0' return 0; * '1' return 1; * '2' return 2; * '3' return 3; * '4' return 4; * '5' return 5; * '6' return 6; * '7' return 7; * '8' return 8; * '9' return 9; * '*' return 10; * gap return 11; * * Anything else (including missing or nonresidue) return -1; * * This mapping of PP chars to return values should probably be * stored in some internal map structure somewhere, instead of * only existing in this function as used by esl_msafile2_ReadInfoPfam(). */ static int get_pp_idx(ESL_ALPHABET *abc, char ppchar) { if(esl_abc_CIsGap(abc, ppchar)) return 11; if(ppchar == '*') return 10; if(ppchar == '9') return 9; if(ppchar == '8') return 8; if(ppchar == '7') return 7; if(ppchar == '6') return 6; if(ppchar == '5') return 5; if(ppchar == '4') return 4; if(ppchar == '3') return 3; if(ppchar == '2') return 2; if(ppchar == '1') return 1; if(ppchar == '0') return 0; return -1; } /* gapize_string * * Given a string, create a new one that is a copy of it, * but with gaps added before each position (apos) as specified * by ngapA[0..apos..len]. specifies the gap character * to add. * * ngapA[0] - number of gaps to add before first posn * ngapA[apos] - number of gaps to add before posn apos * ngapA[len] - number of gaps to add after final posn * * ret_str is allocated here. * * Returns eslOK on success. * eslEMEM on memory error. */ static int gapize_string(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str) { int status; int src_apos = 0; int dst_apos = 0; int i; char *dst_str; ESL_ALLOC(dst_str, sizeof(char) * (dst_len+1)); dst_str[dst_len] = '\0'; /* add gaps before first position */ for(i = 0; i < ngapA[0]; i++) dst_str[dst_apos++] = gapchar; /* add gaps after every position */ for(src_apos = 0; src_apos < src_len; src_apos++) { dst_str[dst_apos++] = src_str[src_apos]; for(i = 0; i < ngapA[(src_apos+1)]; i++) dst_str[dst_apos++] = gapchar; } *ret_dst_str = dst_str; return eslOK; ERROR: return eslEMEM; } /* shrink_string * * Remove some characters of a string in place. * If useme[0..pos..len-1] == FALSE, remove position pos. * */ static void shrink_string(char *str, const int *useme, int len) { int opos, npos; for (opos = 0, npos = 0; opos < len; opos++) { if (useme[opos] == FALSE) continue; str[npos++] = str[opos]; } str[npos] = '\0'; return; } /* determine_spacelen * * Determine number of consecutive ' ' characters * in the string pointed to by s. */ static int determine_spacelen(char *s) { int spacelen = 0; while (*s == ' ') { spacelen++; s++; } return spacelen; } /*------------- end, memory efficient Pfam i/o -----------------*/ /***************************************************************** * 3. Legacy Stockholm parsing tools *****************************************************************/ /* msafile_getline(): * load the next line of into buf>. * Returns eslOK on success, eslEOF on normal eof. * Throws eslEMEM on alloc failure. */ static int msafile2_getline(ESL_MSAFILE2 *afp) { int status; status = esl_fgets(&(afp->buf), &(afp->buflen), afp->f); afp->linenumber++; return status; } static int is_blankline(char *s) { for (; *s != '\0'; s++) if (! isspace((int) *s)) return FALSE; return TRUE; } /* Format of a GF line: * #=GF * Returns eslOK on success; eslEFORMAT on parse failure. * Throws eslEMEM on allocation failure. */ static int parse_gf(ESL_MSA *msa, char *buf) { char *gf; char *tag; char *text; char *tok; char *s; int n; int status; s = buf; if (esl_strtok(&s, " \t\n\r", &gf) != eslOK) return eslEFORMAT; if (esl_strtok(&s, " \t\n\r", &tag) != eslOK) return eslEFORMAT; /* special case: accept a blank #=GF CC line. Otherwise, is required. */ while (isspace(*s)) s++; status = esl_strtok_adv(&s, "\n\r", &text, &n, NULL); if (status == eslEOL) { if (strcmp(tag, "CC") != 0) return eslEFORMAT; } else if (status != eslOK) { return eslEFORMAT; } if (strcmp(tag, "ID") == 0) status = esl_strdup(text, n, &(msa->name)); else if (strcmp(tag, "AC") == 0) status = esl_strdup(text, n, &(msa->acc)); else if (strcmp(tag, "DE") == 0) status = esl_strdup(text, n, &(msa->desc)); else if (strcmp(tag, "AU") == 0) status = esl_strdup(text, n, &(msa->au)); else if (strcmp(tag, "GA") == 0) { /* Pfam has GA1, GA2. Rfam just has GA1. */ s = text; if ((esl_strtok(&s, " \t\n\r", &tok)) != eslOK) return eslEFORMAT; msa->cutoff[eslMSA_GA1] = atof(tok); msa->cutset[eslMSA_GA1] = TRUE; if ((esl_strtok(&s, " \t\n\r", &tok)) == eslOK) { msa->cutoff[eslMSA_GA2] = atof(tok); msa->cutset[eslMSA_GA2] = TRUE; } status = eslOK; } else if (strcmp(tag, "NC") == 0) { s = text; if ((esl_strtok(&s, " \t\n\r", &tok)) != eslOK) return eslEFORMAT; msa->cutoff[eslMSA_NC1] = atof(tok); msa->cutset[eslMSA_NC1] = TRUE; if ((esl_strtok(&s, " \t\n\r", &tok)) == eslOK) { msa->cutoff[eslMSA_NC2] = atof(tok); msa->cutset[eslMSA_NC2] = TRUE; } status = eslOK; } else if (strcmp(tag, "TC") == 0) { s = text; if ((esl_strtok(&s, " \t\n\r", &tok)) != eslOK) return eslEFORMAT; msa->cutoff[eslMSA_TC1] = atof(tok); msa->cutset[eslMSA_TC1] = TRUE; if ((esl_strtok(&s, "\t\n\r", &tok)) == eslOK) { msa->cutoff[eslMSA_TC2] = atof(tok); msa->cutset[eslMSA_TC2] = TRUE; } status = eslOK; } else if (strcmp(tag, "CC") == 0 && text == NULL) status = esl_msa_AddGF (msa, tag, -1, "", -1); else status = esl_msa_AddGF(msa, tag, -1, text, -1); return status; } /* parse_gc(): * Format of a GC line: * #=GC */ static int parse_gc(ESL_MSA *msa, char *buf) { char *gc; char *tag; char *text; char *s; int len; int status; s = buf; if (esl_strtok (&s, " \t\n\r", &gc) != eslOK) return eslEFORMAT; if (esl_strtok (&s, " \t\n\r", &tag) != eslOK) return eslEFORMAT; if (esl_strtok_adv(&s, " \t\n\r", &text, &len, NULL) != eslOK) return eslEFORMAT; if (strcmp(tag, "SS_cons") == 0) status = esl_strcat(&(msa->ss_cons), -1, text, len); else if (strcmp(tag, "SA_cons") == 0) status = esl_strcat(&(msa->sa_cons), -1, text, len); else if (strcmp(tag, "PP_cons") == 0) status = esl_strcat(&(msa->pp_cons), -1, text, len); else if (strcmp(tag, "RF") == 0) status = esl_strcat(&(msa->rf), -1, text, len); else if (strcmp(tag, "MM") == 0) status = esl_strcat(&(msa->mm), -1, text, len); else status = esl_msa_AppendGC(msa, tag, text); return status; } /* parse_comment(): * comments are simply stored verbatim, not parsed */ static int parse_comment(ESL_MSA *msa, char *buf) { char *s; char *comment; s = buf + 1; /* skip leading '#' */ if (*s == '\n' || *s == '\r') { *s = '\0'; comment = s; } /* deal with blank comment */ else if (esl_strtok(&s, "\n\r", &comment)!= eslOK) return eslEFORMAT; return (esl_msa_AddComment(msa, comment, -1)); } /*-------------- end, legacy stockholm parser -------------------*/ /***************************************************************** * 4. Unit tests *****************************************************************/ #ifdef eslMSAFILE2_TESTDRIVE /* write_known_pfam_msa() * Write a known MSA to a tmpfile in Pfam Stockholm format. */ static void write_known_pfam_msa(FILE *ofp) { fprintf(ofp, "# STOCKHOLM 1.0\n"); fprintf(ofp, "#=GF ID pfam-test\n"); fprintf(ofp, "#=GS seq2 DE seq2 is interesting\n"); fprintf(ofp, "seq1 --ACDEFGHIK~LMNPQRS-TVWYACDEFGHIKLMNPQRSTVWY~~~\n"); fprintf(ofp, "seq2 aaACDEFGHIK~LMNPQRS-TVWYACDEFGHIKLMNPQRSTVWYyyy\n"); fprintf(ofp, "seq3 aaACDEFGHIK~LMNPQRS-TVWYACDEFGHIKLMNPQRSTVWYyyy\n"); fprintf(ofp, "#=GC RF ..xxxxxxxxx.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...\n"); fprintf(ofp, "//\n"); return; } static void utest_ReadInfoPfam(char *filename) { char *msg = "ReadInfo() unit test failure"; ESL_MSAFILE2 *mfp = NULL; ESL_MSA *msa = NULL; int nseq = 0; int64_t alen = 0; int ngs = 0; int maxname = 0; int maxgf = 0; int maxgc = 0; int maxgr = 0; if (esl_msafile2_Open(filename, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_msafile2_ReadInfoPfam(mfp, NULL, NULL, -1, NULL, NULL, &msa, &nseq, &alen, &ngs, &maxname, &maxgf, &maxgc, &maxgr, NULL, NULL, NULL, NULL, NULL) != eslOK) esl_fatal(msg); if (msa->nseq != 0) esl_fatal("bad msa->nseq"); if (msa->alen != -1) esl_fatal("bad msa->alen"); if (nseq != 3) esl_fatal("bad nseq"); if (alen != 47) esl_fatal("bad alen"); if (ngs != 1) esl_fatal("bad ngs"); if (maxname != 4) esl_fatal("bad maxname"); if (maxgf != 2) esl_fatal("bad maxgf"); if (maxgc != 2) esl_fatal("bad maxgc"); if (maxgr != 0) esl_fatal("bad maxgr"); esl_msa_Destroy(msa); if (esl_msafile2_ReadInfoPfam(mfp, NULL, NULL, -1, NULL, NULL, &msa, &nseq, &alen, &ngs, &maxname, &maxgf, &maxgc, &maxgr, NULL, NULL, NULL, NULL, NULL) != eslEOF) esl_fatal(msg); if (msa != NULL) esl_fatal(msg); if (nseq != 0 || alen != 0 || ngs != 0 || maxname != 0 || maxgf != 0 || maxgc != 0 || maxgr != 0) esl_fatal("bad nseq"); esl_msafile2_Close(mfp); } static void utest_RegurgitatePfam(char *filename) { char *msg = "RegurgitatePfam() unit test failure"; ESL_MSAFILE2 *mfp = NULL; ESL_MSAFILE *afp = NULL; char tmpfile[16] = "esltmpXXXXXX"; FILE *fp = NULL; ESL_MSA *msa1 = NULL; ESL_MSA *msa2 = NULL; /* regurgitate msa in filename to tmpfile (an msa structure will not be created) */ if (esl_msafile2_Open(filename, NULL, &mfp) != eslOK) esl_fatal(msg); if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg); if (esl_msafile2_RegurgitatePfam(mfp, fp, -1, -1, -1, -1, /* maxname, maxgf, maxgc, maxgr unknown: output msa formatting will match input msa formatting */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, /* do_header, do_trailer, do_blanks, do_comments, do_gf, do_gs, do_gc, do_gr, do_aseq: print all components */ NULL, /* seqs2regurg: if non-NULL specifies which sequences to keep in output */ NULL, /* seqs2skip: if non-NULL specifies which sequences to skip in output */ NULL, /* useme: if non-NULL specifies which columns to keep in output */ NULL, /* add2me: if non-NULL specifies how many gap columns to add in output */ -1, /* expected alignment length, unknown (must not be if useme != NULL or add2me != NULL */ '.', /* gapchar2add, irrelevant since add2me is NULL */ NULL, /* don't return num seqs read */ NULL) /* don't return num seqs read */ != eslOK) esl_fatal(msg); fclose(fp); esl_msafile2_Close(mfp); /* Using normal interface, read in msa from filename as msa1 */ if (esl_msafile_Open(NULL, filename, NULL, eslMSAFILE_PFAM, NULL, &afp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(afp, &msa1) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); /* Using normal interface, read in msa from tmpfile as msa2 */ if (esl_msafile_Open(NULL, tmpfile, NULL, eslMSAFILE_PFAM, NULL, &afp) != eslOK) esl_fatal(msg); if (esl_msafile_Read(afp, &msa2) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); remove(tmpfile); } #endif /* eslMSAFILE2_TESTDRIVE */ /*------------------ end, unit tests ----------------------------*/ /***************************************************************** * 5. Test driver *****************************************************************/ #ifdef eslMSAFILE2_TESTDRIVE /* compile: gcc -g -Wall -I. -L. -o esl_msafile2_utest -DeslMSAFILE2_TESTDRIVE esl_msafile2.c -leasel -lm * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile2_utest -DeslMSAFILE2_TESTDRIVE esl_msafile2.c -leasel -lm * run: ./esl_msafile2_utest */ #include "esl_config.h" #include #include "easel.h" #include "esl_getopts.h" #include "esl_msafile2.h" static ESL_OPTIONS options[] = { /* name type default env range togs reqs incomp help docgrp */ {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0}, { 0,0,0,0,0,0,0,0,0,0}, }; static char usage[] = "[-options]"; static char banner[] = "test driver for legacy memory-efficient Pfam format input/output"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); FILE *fp = NULL; char tmpfile[16] = "esltmpXXXXXX"; if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal("failed to create tmpfile"); write_known_pfam_msa(fp); fclose(fp); utest_ReadInfoPfam(tmpfile); utest_RegurgitatePfam(tmpfile); remove(tmpfile); esl_getopts_Destroy(go); exit(0); } #endif /*eslMSAFILE2_TESTDRIVE*/ /*----------------- end, test driver ----------------------------*/