/* Multiple sequence alignment file i/o * * Table of contents: * 1. Opening/closing an ESL_MSAFILE. * 2. ESL_MSAFILE_FMTDATA: optional added constraints on formats. * 3. Guessing file formats. * 4. Guessing alphabets. * 5. Random MSA flatfile access. * 6. Reading an MSA from an ESL_MSAFILE. * 7. Writing an MSA to a stream. * 8. MSA functions that depend on MSAFILE * 9. Utilities used by specific format parsers. * 10. Unit tests. * 11. Test driver. * 12. Examples. */ #include #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_buffer.h" #include "esl_mem.h" #include "esl_msa.h" #include "esl_ssi.h" #include "esl_msafile.h" /***************************************************************** *# 1. Opening/closing an ESL_MSAFILE *****************************************************************/ static int msafile_Create (ESL_MSAFILE **ret_afp); static int msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE *afp); /* Function: esl_msafile_Open() * Synopsis: Open a multiple sequence alignment file for input. * * Purpose: Open a multiple sequence alignment file for input. * Return an open handle in <*ret_afp>. * * is usually the name of a file. Alignments may * also be read from standard input, or from * gzip-compressed files. If is ``-'', alignment * input is taken from the standard input stream. If * ends in ``.gz'', alignment input is read * through a pipe from . * * , , , and support a variety * of optional/advanced operations, as described * below. Minimally, a caller can set to , * to , and to , * and will be opened in text mode; in the * current working directory; and its format will be * autodetected. * * The argument controls whether data are to be * read in text or digital mode. In digital mode, alignment * data are immediately digitized into an Easel internal * alphabet (which among other things, allows various * things to operate on sequence data more efficiently) and * because an expected alphabet is known, parsers are able * to detect invalid characters. The caller may either * provide an alphabet (thus asserting what it's expected * to be), or have look at the file * and guess what alphabet it appears to be (DNA or amino * acid code, usually). In text mode, alignment data are * read verbatim. It might be advantageous for an * application to read in text mode -- for example, if a * variant alignment format is using characters in some * special way, and you need to deal with them specially. * All this goes through the setting of the passed-by-reference * alphabet pointer . If caller passes NULL for * the argument, input is in text mode. If caller * provides a valid non-NULL pointer but * <*byp_abc> is NULL (that is, caller has declared * and passed <&abc> as an * argument), then we attempt to guess the digital alphabet * using , based on the first * alignment in the input. In this case, the new alphabet * is allocated here and returned to the caller. If caller * provides a digital alphabet (that is, and passed <&abc>), that's * the alphabet we use. * * The argument controls where we search for the * . If is , only the current working * directory is checked. Optionally, caller can provide in * the name of an environment variable ("PFAMDB", * perhaps), in which the routine can find a * colon-delimited list of directories. Then, if * is not found in the current working directory, we look * for it in these directories, in the order they're * listed. * * The argument allows the caller to either allow * to autodetect the file format of * , or to assert that it knows the file is in a * particular format. If is , * format autodetection is performed. Other valid codes include: * | | Stockholm format | * | | Aligned FASTA format | * | | Clustal format (strict) | * | | Clustal-like (MUSCLE, PROBCONS...) | * | | PHYLIP interleaved format | * | | PHYLIP sequential format | * | | UCSC SAM A2M (dotless or dotful) | * | | NCBI PSI-BLAST | * | | a general alignment block format | * * The argument is an optional pointer to a * structure that the caller may * initialize and provide, in order to assert any * additional unusual constraints on the input format -- * for example, to dictate that a PHYLIP format file must * have some nonstandard name field width. Generally, * though, will be , and such things would * either be autodetected as part of the autodetected * format, or the strict version of the parser will be * used. (That is, if you open a PHYLIP file with * and a , the * format autodetector is used, and it will automagically * detect nonstandard PHYLIP namewidths != 10. But if you * open a PHYLIP file with and a * , then you get the strict interleaved PHYLIP * parser, which requires a name width of exactly 10.) * * Args: byp_abc - digital alphabet to use, or NULL for text mode * if <*byp_abc> is NULL, guess the digital alphabet, * create it, and return it in <*byp_abc>. * If <*byp_abc> is a digital alphabet, use it. * msafile - name of alignment input to open; * if "-", read standard input; * if "*.gz", read through a pipe. * env - , or the name of an environment variable * containing colon-delimited list of directories * in which to search for (e.g. "PFAMDB"). * format - format code, such as ; * or to autodetect format. * fmtd - , or a pointer to an initialized * structure, containing * any additional unusual constraints to apply * to the input format. * *ret_afp - RETURN: open MSA input stream. * * Returns: on success, and <*ret_afp> is the newly opened msa file. * * if doesn't exist or can't be * opened for reading; or (in the case of a <.gz> file) if * a executable doesn't exist in user's or * can't be executed. errmsg> is something like * "couldn't open %s for reading", with <%s> being the * name of the msafile. * * if we tried to autodetect the file format * (caller provided ), and * failed. errmsg> is an informative user-directed * message, minimally something like "couldn't determine alignment * input format", possibly more detailed. * * if we tried to autodetect the alphabet * (caller provided <&abc>, to request digital * mode w/ alphabet autodetection) but the alphabet could * not be reliably guessed. * * in the case of a <.gz> file and the * command fails on it. * * On any of these normal errors, <*ret_afp> is returned in * an error state, containing a user-directed error message * in errmsg> and (if relevant) the full path to * that we attempted to open in * bf->filename>. See for * a function that gives a standard way of reporting these * diagnostics to . * * Throws: on allocation failure. * on a system call failure, such as . * if we tried to use but the stream was * invalid (in an error state, , or at ). * On thrown exceptions, <*ret_afp> is . */ int esl_msafile_Open(ESL_ALPHABET **byp_abc, const char *msafile, const char *env, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp) { ESL_MSAFILE *afp = NULL; int status; if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR; if ((status = esl_buffer_Open(msafile, env, &(afp->bf))) != eslOK) ESL_XFAIL(status, afp->errmsg, "%s", afp->bf->errmsg); /* ENOTFOUND; FAIL are normal here */ if ( (status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR; *ret_afp = afp; return eslOK; ERROR: /* on normal errors, afp is returned in an error state */ if (status == eslENOTFOUND || status == eslFAIL || status == eslENOFORMAT || status == eslENODATA || status == eslENOALPHABET) { afp->abc = NULL; *ret_afp = afp;} else { if (afp) esl_msafile_Close(afp); *ret_afp = NULL; } return status; } /* Function: esl_msafile_OpenMem() * Synopsis: Open a string or buffer for parsing as an MSA. * * Purpose: Essentially the same as , except * we ``open'' the string or buffer

, as the * input source. * * If

is a NUL-terminated string, providing its length * is optional; may be passed as -1. If

is an * unterminated buffer, providing the length is * mandatory. */ int esl_msafile_OpenMem(ESL_ALPHABET **byp_abc, const char *p, esl_pos_t n, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp) { ESL_MSAFILE *afp = NULL; int status; if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR; if ( (status = esl_buffer_OpenMem(p, n, &(afp->bf))) != eslOK) goto ERROR; if ( (status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR; *ret_afp = afp; return eslOK; ERROR: if (status == eslENOTFOUND || status == eslFAIL || status == eslEFORMAT || status == eslENODATA || status == eslENOALPHABET) { afp->abc = NULL; *ret_afp = afp;} else { if (afp) esl_msafile_Close(afp); *ret_afp = NULL; } return status; } /* Function: esl_msafile_OpenBuffer() * Synopsis: Open an input buffer for parsing as an MSA. * * Purpose: Essentially the same as , except * we ``open'' an that's already been * opened by the caller for some input source. */ int esl_msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp) { ESL_MSAFILE *afp = NULL; int status; if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR; afp->bf = bf; if ((status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR; *ret_afp = afp; return eslOK; ERROR: if (status == eslENOTFOUND || status == eslFAIL || status == eslEFORMAT || status == eslENODATA || status == eslENOALPHABET) { afp->abc = NULL; *ret_afp = afp;} else { if (afp) esl_msafile_Close(afp); *ret_afp = NULL; } return status; } /* Function: esl_msafile_OpenFailure() * Synopsis: Report diagnostics of normal error in opening MSA file, and exit. * * Purpose: Report user-directed diagnostics of a normal error in opening * an MSA input. Print a message to , then exit. */ void esl_msafile_OpenFailure(ESL_MSAFILE *afp, int status) { int show_source = FALSE; int show_fmt = FALSE; fprintf(stderr, "Alignment input open failed.\n"); if (status == eslENOTFOUND) { fprintf(stderr, " %s\n", afp->errmsg); } else if (status == eslFAIL) { fprintf(stderr, " %s\n", afp->errmsg); } else if (status == eslENOFORMAT) { fprintf(stderr, " %s\n", afp->errmsg); show_source = TRUE; } else if (status == eslENOALPHABET) { fprintf(stderr, " %s\n", afp->errmsg); show_source = TRUE; show_fmt = TRUE; } else if (status == eslEMEM) { fprintf(stderr, " Memory allocation failure\n"); } else if (status == eslESYS) { fprintf(stderr, " System call failed, possibly fread()\n"); } else { fprintf(stderr, " Unexpected error code %d\n", status); } if (show_source) { switch (afp->bf->mode_is) { case eslBUFFER_STREAM: fprintf(stderr, " while reading from an input stream (not a file)\n"); break; case eslBUFFER_CMDPIPE: fprintf(stderr, " while reading through a pipe (not a file)\n"); break; case eslBUFFER_FILE: case eslBUFFER_ALLFILE: case eslBUFFER_MMAP: fprintf(stderr, " while reading file %s\n", afp->bf->filename); break; case eslBUFFER_STRING: fprintf(stderr, " while reading from a provided string (not a file)\n"); break; default: break; } } if (show_fmt) { fprintf(stderr, " while parsing for %s format\n", esl_msafile_DecodeFormat(afp->format)); } esl_msafile_Close(afp); exit(status); } /* Function: esl_msafile_SetDigital() * Synopsis: Convert an open text-mode ESL_MSAFILE to digital mode. * * Purpose: Convert the open from text mode to digital mode, * using alphabet . * * Note: This function is only here for legacy support: it's * called by esl_sqio, which still uses an outdated * open / guess alphabet / set digital pattern. When * sqio is upgraded next, this function should be removed. */ int esl_msafile_SetDigital(ESL_MSAFILE *afp, const ESL_ALPHABET *abc) { int status; afp->abc = abc; switch (afp->format) { case eslMSAFILE_A2M: status = esl_msafile_a2m_SetInmap( afp); break; case eslMSAFILE_AFA: status = esl_msafile_afa_SetInmap( afp); break; case eslMSAFILE_CLUSTAL: status = esl_msafile_clustal_SetInmap( afp); break; case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_SetInmap( afp); break; case eslMSAFILE_PFAM: status = esl_msafile_stockholm_SetInmap(afp); break; case eslMSAFILE_PHYLIP: status = esl_msafile_phylip_SetInmap( afp); break; case eslMSAFILE_PHYLIPS: status = esl_msafile_phylip_SetInmap( afp); break; case eslMSAFILE_PSIBLAST: status = esl_msafile_psiblast_SetInmap( afp); break; case eslMSAFILE_SELEX: status = esl_msafile_selex_SetInmap( afp); break; case eslMSAFILE_STOCKHOLM: status = esl_msafile_stockholm_SetInmap(afp); break; default: ESL_EXCEPTION(eslEINCONCEIVABLE, "no such alignment file format"); } return status; } /* Function: esl_msafile_Close() * Synopsis: Close an open . */ void esl_msafile_Close(ESL_MSAFILE *afp) { if (afp) { if (afp->bf) esl_buffer_Close(afp->bf); if (afp->ssi) esl_ssi_Close(afp->ssi); free(afp); } } static int msafile_Create(ESL_MSAFILE **ret_afp) { ESL_MSAFILE *afp = NULL; int status; ESL_ALLOC(afp, sizeof(ESL_MSAFILE)); afp->bf = NULL; afp->line = NULL; afp->n = 0; afp->linenumber = 0; afp->lineoffset = 0; afp->format = eslMSAFILE_UNKNOWN; afp->abc = NULL; afp->ssi = NULL; afp->errmsg[0] = '\0'; esl_msafile_fmtdata_Init(&(afp->fmtd)); *ret_afp = afp; return eslOK; ERROR: *ret_afp = NULL; return status; } /* All input sources funnel through here. * Here, is already allocated and initialized, and the input * is opened successfully. */ static int msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE *afp) { ESL_ALPHABET *abc = NULL; int alphatype = eslUNKNOWN; int status; /* if caller provided , copy it into afp->fmtd */ if (fmtd) esl_msafile_fmtdata_Copy(fmtd, &(afp->fmtd)); /* Determine the format */ if (format == eslMSAFILE_UNKNOWN && (status = esl_msafile_GuessFileFormat(afp->bf, &format, &(afp->fmtd), afp->errmsg)) != eslOK) goto ERROR; afp->format = format; /* Determine the alphabet; set . ( == NULL means text mode.) */ /* Note that GuessAlphabet() functions aren't allowed to use the inmap, because it isn't set yet */ if (byp_abc && *byp_abc) /* Digital mode, and caller provided the alphabet */ { abc = *byp_abc; alphatype = abc->type; } else if (byp_abc) /* Digital mode, and caller wants us to guess and create an alphabet */ { status = esl_msafile_GuessAlphabet(afp, &alphatype); if (status == eslENOALPHABET) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "couldn't guess alphabet (maybe try --dna/--rna/--amino if available)"); else if (status != eslOK) goto ERROR; if ( (abc = esl_alphabet_Create(alphatype)) == NULL) { status = eslEMEM; goto ERROR; } } afp->abc = abc; /* with afp->abc set, the inmap config functions know whether to do digital/text */ /* Configure the format-specific, digital or text mode character * input map in afp->inmap. * All of these must: * * set inmap[0] to an appropriate 'unknown' character, to replace * invalid input with. * set ' ' to eslDSQ_IGNORE (if we're supposed to accept and skip * it), or map it to a gap, or set it as eslDSQ_ILLEGAL. * in digital mode, copy the abc->inmap * in text mode, decide if we should accept most any * non-whitespace character (isgraph()), or if the format is * inherently restrictive and we should go with isalpha() + * some other valid characters "_-.~*" instead. */ switch (afp->format) { case eslMSAFILE_A2M: if ((status = esl_msafile_a2m_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_AFA: if ((status = esl_msafile_afa_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_CLUSTAL: if ((status = esl_msafile_clustal_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_CLUSTALLIKE: if ((status = esl_msafile_clustal_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_PFAM: if ((status = esl_msafile_stockholm_SetInmap(afp)) != eslOK) goto ERROR; break; case eslMSAFILE_PHYLIP: if ((status = esl_msafile_phylip_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_PHYLIPS: if ((status = esl_msafile_phylip_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_PSIBLAST: if ((status = esl_msafile_psiblast_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_SELEX: if ((status = esl_msafile_selex_SetInmap( afp)) != eslOK) goto ERROR; break; case eslMSAFILE_STOCKHOLM: if ((status = esl_msafile_stockholm_SetInmap(afp)) != eslOK) goto ERROR; break; default: ESL_XEXCEPTION(eslENOFORMAT, "no such alignment file format"); } if (esl_byp_IsReturned(byp_abc)) *byp_abc = abc; return eslOK; ERROR: /* on normal errors, afp is returned in an error state */ if (abc && ! esl_byp_IsProvided(byp_abc)) { esl_alphabet_Destroy(abc); } if (esl_byp_IsReturned(byp_abc)) *byp_abc = NULL; afp->abc = NULL; return status; } /*------------- end, open/close an ESL_MSAFILE -----------------*/ /***************************************************************** *# 2. ESL_MSAFILE_FMTDATA: optional extra constraints on formats. *****************************************************************/ /* Function: esl_msafile_fmtdata_Init() * Synopsis: Initialize a structure. */ int esl_msafile_fmtdata_Init(ESL_MSAFILE_FMTDATA *fmtd) { fmtd->namewidth = 0; fmtd->rpl = 0; return eslOK; } /* Function: esl_msafile_fmtdata_Copy() * Synopsis: Copy one structure to another. */ int esl_msafile_fmtdata_Copy(ESL_MSAFILE_FMTDATA *src, ESL_MSAFILE_FMTDATA *dst) { dst->namewidth = src->namewidth; dst->rpl = src->rpl; return eslOK; } /*--------------- ESL_MSAFILE_FMTDATA --------------------------*/ /***************************************************************** *# 3. Guessing file format. *****************************************************************/ static int msafile_check_selex (ESL_BUFFER *bf); /* Function: esl_msafile_GuessFileFormat() * Synopsis: Guess the MSA file format of an open buffer. * * Purpose: Peek into an open buffer, and try to determine what * alignment file format (if any) its input is in. If a * format can be determined, return and set * <*ret_fmtcode> to the format code. If not, return * and set <*ret_fmtcode> to * . In either case, the buffer is * restored to its original position upon return. * * Some formats may have variants that require special * handling. Caller may pass a pointer <*opt_fmtd> to a * structure to capture this * information from format autodetection, or . If the * structure is provided, it is reinitialized, and any * fields that can be determined by the appropriate * format-guessing function are filled in. If <*opt_fmtd> * is , the range of variation that can be captured * by some formats may be limited. Currently this only * affects PHYLIP format, where namewidth> * allows files with nonstandard name field widths to be * autodetected and parsed. * * Args: bf - the open buffer to read input from * ret_fmtcode - RETURN: format code that's determined * opt_fmtd - optRETURN: ptr to an structure to * be filled in with additional format-specific data, or * errbuf - optRETURN: space for an informative error message if * format autodetection fails; caller allocates for bytes. * * Returns: on success, and <*ret_fmtcode> contains the format code. * * if format can't be guessed; now <*ret_fmtcode> contains * , and optional contains an explanation. * * Throws: (no abnormal error conditions) */ int esl_msafile_GuessFileFormat(ESL_BUFFER *bf, int *ret_fmtcode, ESL_MSAFILE_FMTDATA *opt_fmtd, char *errbuf) { esl_pos_t initial_offset; char *p; esl_pos_t n; int fmt_bysuffix = eslMSAFILE_UNKNOWN; int fmt_byfirstline = eslMSAFILE_UNKNOWN; int status; /* Initialize the optional data, if provided (move this initialization to a function someday) */ if (opt_fmtd) esl_msafile_fmtdata_Init(opt_fmtd); if (errbuf) errbuf[0] = '\0'; /* As we start, save parser status: * remember the offset where we started (usually 0, but not necessarily) * set an anchor to be sure that this offset stays in the buffer's memory */ initial_offset = esl_buffer_GetOffset(bf); esl_buffer_SetAnchor(bf, initial_offset); /* We may use a filename suffix as a clue, especially when * distinguishing formats that may not be distinguishable. * (if there's a filename, and if it has a suffix, anyway.) */ if (bf->filename) { esl_file_Extension(bf->filename, 0, &p, &n); if (esl_memstrcmp(p, n, ".gz")) esl_file_Extension(bf->filename, 3, &p, &n); if (p) { if (esl_memstrcmp(p, n, ".sto")) fmt_bysuffix = eslMSAFILE_STOCKHOLM; else if (esl_memstrcmp(p, n, ".sth")) fmt_bysuffix = eslMSAFILE_STOCKHOLM; else if (esl_memstrcmp(p, n, ".stk")) fmt_bysuffix = eslMSAFILE_STOCKHOLM; else if (esl_memstrcmp(p, n, ".afa")) fmt_bysuffix = eslMSAFILE_AFA; else if (esl_memstrcmp(p, n, ".afasta")) fmt_bysuffix = eslMSAFILE_AFA; else if (esl_memstrcmp(p, n, ".pfam")) fmt_bysuffix = eslMSAFILE_PFAM; else if (esl_memstrcmp(p, n, ".a2m")) fmt_bysuffix = eslMSAFILE_A2M; else if (esl_memstrcmp(p, n, ".slx")) fmt_bysuffix = eslMSAFILE_SELEX; else if (esl_memstrcmp(p, n, ".selex")) fmt_bysuffix = eslMSAFILE_SELEX; else if (esl_memstrcmp(p, n, ".pb")) fmt_bysuffix = eslMSAFILE_PSIBLAST; else if (esl_memstrcmp(p, n, ".ph")) fmt_bysuffix = eslMSAFILE_PHYLIP; else if (esl_memstrcmp(p, n, ".phy")) fmt_bysuffix = eslMSAFILE_PHYLIP; else if (esl_memstrcmp(p, n, ".phyi")) fmt_bysuffix = eslMSAFILE_PHYLIP; else if (esl_memstrcmp(p, n, ".phys")) fmt_bysuffix = eslMSAFILE_PHYLIPS; } } /* We peek at the first non-blank line of the file. * Multiple sequence alignment files are often identifiable by a token on this line. */ /* Skip blank lines, get first non-blank one */ do { status = esl_buffer_GetLine(bf, &p, &n); } while (status == eslOK && esl_memspn(p, n, " \t") == n); if (status == eslEOF) ESL_XFAIL(eslENOFORMAT, errbuf, "can't guess alignment input format: empty file/no data"); if (esl_memstrpfx(p, n, "# STOCKHOLM")) fmt_byfirstline = eslMSAFILE_STOCKHOLM; else if (esl_memstrpfx(p, n, ">")) fmt_byfirstline = eslMSAFILE_AFA; else if (esl_memstrpfx(p, n, "CLUSTAL")) fmt_byfirstline = eslMSAFILE_CLUSTAL; else if (esl_memstrcontains(p, n, "multiple sequence alignment")) fmt_byfirstline = eslMSAFILE_CLUSTALLIKE; else { char *tok; esl_pos_t toklen; /* look for , characteristic of PHYLIP files */ if (esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK && esl_memspn(tok, toklen, "0123456789") == toklen && esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK && esl_memspn(tok, toklen, "0123456789") == toklen) fmt_byfirstline = eslMSAFILE_PHYLIP; /* interleaved for now; we'll look more closely soon */ } /* Restore parser status, rewind to start */ esl_buffer_SetOffset (bf, initial_offset); esl_buffer_RaiseAnchor(bf, initial_offset); /* Rules to determine formats. * If we have to call a routine that looks deeper into the buffer * than the first line, that routine must restore buffer status. */ if (fmt_byfirstline == eslMSAFILE_STOCKHOLM) { if (fmt_bysuffix == eslMSAFILE_STOCKHOLM) *ret_fmtcode = eslMSAFILE_STOCKHOLM; else if (fmt_bysuffix == eslMSAFILE_PFAM) *ret_fmtcode = eslMSAFILE_PFAM; else *ret_fmtcode = eslMSAFILE_STOCKHOLM; } else if (fmt_byfirstline == eslMSAFILE_CLUSTAL) { *ret_fmtcode = eslMSAFILE_CLUSTAL; } else if (fmt_byfirstline == eslMSAFILE_CLUSTALLIKE) { *ret_fmtcode = eslMSAFILE_CLUSTALLIKE; } else if (fmt_byfirstline == eslMSAFILE_AFA) { if (fmt_bysuffix == eslMSAFILE_A2M) *ret_fmtcode = eslMSAFILE_A2M; // A2M requires affirmative identification by caller, else *ret_fmtcode = eslMSAFILE_AFA; // because of ambiguity w/ AFA. } else if (fmt_byfirstline == eslMSAFILE_PHYLIP) { if (fmt_bysuffix == eslMSAFILE_PHYLIP) *ret_fmtcode = eslMSAFILE_PHYLIP; else if (fmt_bysuffix == eslMSAFILE_PHYLIPS) *ret_fmtcode = eslMSAFILE_PHYLIPS; else { int namewidth; status = esl_msafile_phylip_CheckFileFormat(bf, ret_fmtcode, &namewidth); if (status == eslEAMBIGUOUS) ESL_XFAIL(eslENOFORMAT, errbuf, "can't guess format: it's consistent w/ both phylip, phylips."); else if (status == eslFAIL) ESL_XFAIL(eslENOFORMAT, errbuf, "format unrecognized, though it looks phylip-like"); if (opt_fmtd) opt_fmtd->namewidth = namewidth; else if (namewidth != 10) ESL_XFAIL(eslENOFORMAT, errbuf, "can't parse nonstandard PHYLIP name width (expected 10)"); /* if we can't store the nonstandard width, we can't allow the caller to think it can parse this */ } } else // if we haven't guessed so far, try selex. { /* selex parser can handle psiblast too */ if (fmt_bysuffix == eslMSAFILE_SELEX) *ret_fmtcode = eslMSAFILE_SELEX; else if (msafile_check_selex(bf) == eslOK) { if (fmt_bysuffix == eslMSAFILE_PSIBLAST) *ret_fmtcode = eslMSAFILE_PSIBLAST; else *ret_fmtcode = eslMSAFILE_SELEX; } else ESL_XFAIL(eslENOFORMAT, errbuf, "couldn't guess alignment input format - doesn't even look like selex"); } return eslOK; ERROR: *ret_fmtcode = eslMSAFILE_UNKNOWN; return status; } /* Function: esl_msafile_IsMultiRecord() * Synopsis: Test if a format supports multiple MSAs per file. * * Purpose: Return if MSA file format supports * more than one MSA record per file. Return * otherwise (including the cases of being * invalid or ). */ int esl_msafile_IsMultiRecord(int fmt) { switch (fmt) { case eslMSAFILE_UNKNOWN: return FALSE; case eslMSAFILE_STOCKHOLM: return TRUE; case eslMSAFILE_PFAM: return TRUE; case eslMSAFILE_A2M: return FALSE; case eslMSAFILE_PSIBLAST: return FALSE; case eslMSAFILE_SELEX: return FALSE; case eslMSAFILE_AFA: return FALSE; case eslMSAFILE_CLUSTAL: return FALSE; case eslMSAFILE_CLUSTALLIKE: return FALSE; case eslMSAFILE_PHYLIP: return TRUE; /* because seqboot. undocumented in phylip, phylip format can come out multi-msa */ case eslMSAFILE_PHYLIPS: return TRUE; /* ditto */ default: return FALSE; } return FALSE; /* keep compilers happy */ } /* Function: esl_msafile_EncodeFormat() * Synopsis: Convert text string to an MSA file format code. * * Purpose: Given a text string, match it case-insensitively * against a list of possible formats, and return the * appropriate MSA file format code. For example, * returns * . * * If the format is unrecognized, return * . * * Note: Keep in sync with , * which decodes all possible sequence file formats, * both unaligned and aligned. */ int esl_msafile_EncodeFormat(char *fmtstring) { if (strcasecmp(fmtstring, "stockholm") == 0) return eslMSAFILE_STOCKHOLM; if (strcasecmp(fmtstring, "pfam") == 0) return eslMSAFILE_PFAM; if (strcasecmp(fmtstring, "a2m") == 0) return eslMSAFILE_A2M; if (strcasecmp(fmtstring, "psiblast") == 0) return eslMSAFILE_PSIBLAST; if (strcasecmp(fmtstring, "selex") == 0) return eslMSAFILE_SELEX; if (strcasecmp(fmtstring, "afa") == 0) return eslMSAFILE_AFA; if (strcasecmp(fmtstring, "clustal") == 0) return eslMSAFILE_CLUSTAL; if (strcasecmp(fmtstring, "clustallike") == 0) return eslMSAFILE_CLUSTALLIKE; if (strcasecmp(fmtstring, "phylip") == 0) return eslMSAFILE_PHYLIP; if (strcasecmp(fmtstring, "phylips") == 0) return eslMSAFILE_PHYLIPS; return eslMSAFILE_UNKNOWN; } /* Function: esl_msafile_DecodeFormat() * Synopsis: Convert internal file format code to text string. * * Purpose: Given an internal file format code * (, for example), returns * a string suitable for printing ("Stockholm", * for example). * * Returns: a pointer to a static description string. * * Throws: If code isn't valid, throws an exception * internally, and returns . * * Note: Keep in sync with . */ char * esl_msafile_DecodeFormat(int fmt) { switch (fmt) { case eslMSAFILE_UNKNOWN: return "unknown"; case eslMSAFILE_STOCKHOLM: return "Stockholm"; case eslMSAFILE_PFAM: return "Pfam"; case eslMSAFILE_A2M: return "UCSC A2M"; case eslMSAFILE_PSIBLAST: return "PSI-BLAST"; case eslMSAFILE_SELEX: return "SELEX"; case eslMSAFILE_AFA: return "aligned FASTA"; case eslMSAFILE_CLUSTAL: return "Clustal"; case eslMSAFILE_CLUSTALLIKE: return "Clustal-like"; case eslMSAFILE_PHYLIP: return "PHYLIP (interleaved)"; case eslMSAFILE_PHYLIPS: return "PHYLIP (sequential)"; default: break; } esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "no such msa format code %d\n", fmt); return NULL; } /* msafile_check_selex() * Checks whether an input source appears to be in SELEX format. * * Check whether the input source appears to be a * SELEX-format alignment file, starting from the current point, * to the end of the input. Return if so, * if not. * * The checker is more rigorous than the parser. To be autodetected, * the SELEX file can't use whitespace for gaps. The parser, though, * allows whitespace. A caller would have to specific SELEX format * explicitly in that case. */ static int msafile_check_selex(ESL_BUFFER *bf) { esl_pos_t start_offset = -1; int block_nseq = 0; /* Number of seqs in each block is checked */ int nseq = 0; esl_pos_t block_nres = 0; /* Number of residues in each line is checked */ char *firstname = NULL; /* First seq name of every block is checked */ esl_pos_t namelen = 0; int blockidx = 0; int in_block = FALSE; char *p, *tok; esl_pos_t n, toklen; int status; /* Anchor at the start of the input, so we can rewind */ start_offset = esl_buffer_GetOffset(bf); if ( (status = esl_buffer_SetAnchor(bf, start_offset)) != eslOK) goto ERROR; while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK) { /* Some automatic giveaways of SELEX format */ if (esl_memstrpfx(p, n, "#=RF")) { status = eslOK; goto DONE; } if (esl_memstrpfx(p, n, "#=CS")) { status = eslOK; goto DONE; } if (esl_memstrpfx(p, n, "#=SS")) { status = eslOK; goto DONE; } if (esl_memstrpfx(p, n, "#=SA")) { status = eslOK; goto DONE; } /* skip comments */ if (esl_memstrpfx(p, n, "#")) continue; /* blank lines: end block, reset block counters */ if (esl_memspn(p, n, " \t") == n) { if (block_nseq && block_nseq != nseq) { status = eslFAIL; goto DONE;} /* each block has same # of seqs */ if (in_block) blockidx++; if (blockidx >= 3) { status = eslOK; goto DONE; } /* stop after three blocks; we're pretty sure by now */ in_block = FALSE; block_nres = 0; block_nseq = nseq; nseq = 0; continue; } /* else we're a "sequence" line. test for two and only two non-whitespace * fields; test that second field has same length; test that each block * starts with the same seq name.. */ in_block = TRUE; if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) goto ERROR; /* there's at least one token - we already checked for blank lines */ if (nseq == 0) /* check first seq name in each block */ { if (blockidx == 0) { firstname = tok; namelen = toklen; } /* First block: set the name we check against. */ else if (toklen != namelen || memcmp(tok, firstname, toklen) != 0) { status = eslFAIL; goto DONE; } /* Subsequent blocks */ } if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) { status = eslFAIL; goto DONE; } if (block_nres && toklen != block_nres) { status = eslFAIL; goto DONE; } block_nres = toklen; if (esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK) { status = eslFAIL; goto DONE; } nseq++; } if (status != eslEOF) goto ERROR; /* EOF is expected and good; anything else is bad */ if (in_block) blockidx++; status = (blockidx ? eslOK : eslFAIL); /* watch out for the case of no input */ /* deliberate readthru */ DONE: if (start_offset != -1) { if (esl_buffer_SetOffset(bf, start_offset) != eslOK) goto ERROR; if (esl_buffer_RaiseAnchor(bf, start_offset) != eslOK) goto ERROR; } return status; ERROR: if (start_offset != -1) { esl_buffer_SetOffset(bf, start_offset); esl_buffer_RaiseAnchor(bf, start_offset); } return status; } /*---------------- end, file format utilities -------------------*/ /***************************************************************** *# 4. Guessing alphabet *****************************************************************/ /* Function: esl_msafile_GuessAlphabet() * Synopsis: Guess what kind of sequences the MSA file contains. * * Purpose: Guess the alphabet of the sequences in the open * -- , , or -- * based on the residue composition of the input. * * Returns: Returns on success, and <*ret_type> is set * to , , or . * * Returns and sets <*ret_type> to * if the alphabet cannot be reliably * guessed. * * Either way, the 's buffer is restored to its original * state, no matter how much data we had to read while trying * to guess. * * Throws: - an allocation error. * - a system call such as failed. * - "impossible" corruption, internal bug */ int esl_msafile_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type) { int status = eslENOALPHABET; switch (afp->format) { case eslMSAFILE_STOCKHOLM: status = esl_msafile_stockholm_GuessAlphabet(afp, ret_type); break; case eslMSAFILE_PFAM: status = esl_msafile_stockholm_GuessAlphabet(afp, ret_type); break; case eslMSAFILE_A2M: status = esl_msafile_a2m_GuessAlphabet (afp, ret_type); break; case eslMSAFILE_PSIBLAST: status = esl_msafile_psiblast_GuessAlphabet (afp, ret_type); break; case eslMSAFILE_SELEX: status = esl_msafile_selex_GuessAlphabet (afp, ret_type); break; case eslMSAFILE_AFA: status = esl_msafile_afa_GuessAlphabet (afp, ret_type); break; case eslMSAFILE_CLUSTAL: status = esl_msafile_clustal_GuessAlphabet (afp, ret_type); break; case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_GuessAlphabet (afp, ret_type); break; case eslMSAFILE_PHYLIP: status = esl_msafile_phylip_GuessAlphabet (afp, ret_type); break; case eslMSAFILE_PHYLIPS: status = esl_msafile_phylip_GuessAlphabet (afp, ret_type); break; } return status; } /*----------- end, utilities for alphabets ----------------------*/ /***************************************************************** *# 5. Random msa flatfile database access (with SSI) *****************************************************************/ /* Function: esl_msafile_PositionByKey() * Synopsis: Use SSI to reposition file to start of named MSA. * * Purpose: Reposition so that the next MSA we read * will be the one named (or accessioned) . * * Returns: on success, and the file is repositioned * such that the next call will read the * alignment named . * * Returns if isn't found in the index * for . * * Returns if something goes wrong trying to * read the index, indicating some sort of file format * problem in the SSI file. * * Throws: if there's no open SSI index; * if the is invalid, either requiring rewind * of a nonrewindable stream, or off the end of the data; * if a system call such as fails; * on allocation failure. * In all these cases, the state of the is uncertain * and may be corrupt; the application should not continue * to use it. */ int esl_msafile_PositionByKey(ESL_MSAFILE *afp, const char *key) { uint16_t fh; off_t offset; int status; if (afp->ssi == NULL) ESL_EXCEPTION(eslENODATA, "Need an open SSI index to call esl_msafile_PositionByKey()"); if ((status = esl_ssi_FindName(afp->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status; /* eslENOTFOUND|eslEFORMAT [eslEMEM] */ if ((status = esl_buffer_SetOffset(afp->bf, offset)) != eslOK) return status; /* [eslEINVAL|eslESYS|eslEMEM] */ /* The linenumber gets messed up after a file positioning. Best we can do * is to turn it off (set it to -1). FIX THIS next time SSI format is * changed: add linenumber to a primary key record. */ afp->linenumber = -1; return eslOK; } /*--------------------- end SSI functions -----------------------*/ /***************************************************************** *# 6. Reading MSAs from input *****************************************************************/ /* Function: esl_msafile_Read() * Synopsis: Read next MSA from input. * * Purpose: Reads the next MSA from open MSA input , and return it in * <*ret_msa>. * * Args: afp - open alignment input stream 8 *ret_msa - RETURN: alignment * * Returns: on success. * * on a parse error, and errmsg> is set * to a user-directed error message; <*ret_msa> is . * * If no alignment is found at all, returns , * and errmsg> is blank; <*ret_msa> is . * * On normal error, and the return status code may be * passed to to print diagnostics * to (including input source information and line * number) and exit. * * Throws: - an allocation failed. * - a system call such as fread() failed * - "impossible" corruption */ int esl_msafile_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa) { ESL_MSA *msa = NULL; int status = eslOK; esl_pos_t offset = esl_buffer_GetOffset(afp->bf); switch (afp->format) { case eslMSAFILE_A2M: if ((status = esl_msafile_a2m_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_AFA: if ((status = esl_msafile_afa_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_CLUSTAL: if ((status = esl_msafile_clustal_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_CLUSTALLIKE: if ((status = esl_msafile_clustal_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_PFAM: if ((status = esl_msafile_stockholm_Read(afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_PHYLIP: if ((status = esl_msafile_phylip_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_PHYLIPS: if ((status = esl_msafile_phylip_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_PSIBLAST: if ((status = esl_msafile_psiblast_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_SELEX: if ((status = esl_msafile_selex_Read (afp, &msa)) != eslOK) goto ERROR; break; case eslMSAFILE_STOCKHOLM: if ((status = esl_msafile_stockholm_Read(afp, &msa)) != eslOK) goto ERROR; break; default: ESL_EXCEPTION(eslEINCONCEIVABLE, "no such msa file format"); } msa->offset = offset; *ret_msa = msa; return eslOK; ERROR: if (msa) esl_msa_Destroy(msa); *ret_msa = NULL; return status; } /* Function: esl_msafile_ReadFailure() * Synopsis: Report diagnostics of a normal error in parsing MSA file, and exit. * * Purpose: Report user-directed diagnostics of a normal error from * parsing an MSA file. Output the error message to * , along with information about what we were * parsing (filename, if it was a file) and where we were * in the input (linenumber, if we know it). This * information is all available in . Then close * and exit with the provided by the caller. * * Args: afp - open ESL_MSAFILE, containing information about * the error and the input source. * status - exit status; generally eslEFORMAT. * * Returns: no return. Exits here with . */ void esl_msafile_ReadFailure(ESL_MSAFILE *afp, int status) { switch (status) { case eslEFORMAT: fprintf(stderr, "Alignment input parse error:\n %s\n", afp->errmsg); break; case eslEOF: fprintf(stderr, "Alignment input appears empty?\n"); break; default: fprintf(stderr, "Alignment input read error; unexpected code %d\n", status); break; } switch (afp->bf->mode_is) { case eslBUFFER_STREAM: fprintf(stderr, " while reading %s from an input stream (not a file)\n", esl_msafile_DecodeFormat(afp->format)); break; case eslBUFFER_CMDPIPE: fprintf(stderr, " while reading %s through a pipe (not a file)\n", esl_msafile_DecodeFormat(afp->format)); break; case eslBUFFER_FILE: case eslBUFFER_ALLFILE: case eslBUFFER_MMAP: fprintf(stderr, " while reading %s file %s\n", esl_msafile_DecodeFormat(afp->format), afp->bf->filename); break; case eslBUFFER_STRING: fprintf(stderr, " while reading %s from a provided string (not a file)\n", esl_msafile_DecodeFormat(afp->format)); break; default: break; } if (afp->linenumber > 0) fprintf(stderr, " at or near line %" PRIu64 "\n", afp->linenumber); else fprintf(stderr, " at or near byte %" PRIu64 "\n", esl_buffer_GetOffset(afp->bf)); esl_msafile_Close(afp); exit(status); } /*------------ end, reading MSA from ESL_MSAFILE ---------------*/ /***************************************************************** *# 7. Writing an MSA to a stream. *****************************************************************/ /* Function: esl_msafile_Write() * Synopsis: Write an MSA to a stream. * * Purpose: Writes alignment to open stream in format . * * In general, the is unchanged, but there are some * exceptions. For example, writing an alignment in A2M format * will alter alignment data (marking missing data * symbols on heuristically defined sequence fragments) and * create an <\#=RF> annotation line, if an rf> * annotation line isn't already present in the . * * Args: fp - open stream (such as ) * msa - alignment to write * fmt - format code (such as ) * * Returns: on success. * * Throws: on allocation error. * on any system write error, such as a filled disk. */ int esl_msafile_Write(FILE *fp, ESL_MSA *msa, int fmt) { int status; switch (fmt) { case eslMSAFILE_STOCKHOLM: status = esl_msafile_stockholm_Write(fp, msa, eslMSAFILE_STOCKHOLM); break; case eslMSAFILE_PFAM: status = esl_msafile_stockholm_Write(fp, msa, eslMSAFILE_PFAM); break; case eslMSAFILE_A2M: status = esl_msafile_a2m_Write (fp, msa); break; case eslMSAFILE_PSIBLAST: status = esl_msafile_psiblast_Write (fp, msa); break; case eslMSAFILE_SELEX: status = esl_msafile_selex_Write (fp, msa); break; case eslMSAFILE_AFA: status = esl_msafile_afa_Write (fp, msa); break; case eslMSAFILE_CLUSTAL: status = esl_msafile_clustal_Write (fp, msa, eslMSAFILE_CLUSTAL); break; case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_Write (fp, msa, eslMSAFILE_CLUSTALLIKE); break; case eslMSAFILE_PHYLIP: status = esl_msafile_phylip_Write (fp, msa, eslMSAFILE_PHYLIP, NULL); break; case eslMSAFILE_PHYLIPS: status = esl_msafile_phylip_Write (fp, msa, eslMSAFILE_PHYLIPS, NULL); break; default: ESL_EXCEPTION(eslEINCONCEIVABLE, "no such msa file format"); } return status; } /*-------------- end, writing MSA to stream ---------------------*/ /***************************************************************** *# 8. MSA functions that depend on MSAFILE *****************************************************************/ /* Function: esl_msa_CreateFromString() * Synopsis: Creates a small from a test case string. * * Purpose: A convenience for making small test cases in the test * suites: given the contents of a complete multiple * sequence alignment file as a single string in * alignment format , convert it to an . * * For example, * ``` * esl_msa_CreateFromString("# STOCKHOLM 1.0\n" * "seq1 AAAAA\n" * "seq2 AAAAA\n" * "//\n", eslMSAFILE_STOCKHOLM); * ``` * creates an ungapped alignment of two AAAAA sequences. * (Using string concatenation in C99 makes for a cleaner * looking multi-line string.) * * The is in text mode. If you want it in digital * mode, use on it. * * Returns: a pointer to the new on success. * * Throws: if it fails to obtain, open, or read the temporary file * that it puts the string in. */ ESL_MSA * esl_msa_CreateFromString(const char *s, int fmt) { ESL_MSAFILE *mfp = NULL; ESL_MSA *msa = NULL; if (esl_msafile_OpenMem(NULL, s, -1, fmt, NULL, &mfp) != eslOK) goto ERROR; if (esl_msafile_Read(mfp, &msa) != eslOK) goto ERROR; esl_msafile_Close(mfp); return msa; ERROR: if (mfp) esl_msafile_Close(mfp); if (msa) esl_msa_Destroy(msa); return NULL; } /***************************************************************** *# 9. Utilities used by specific format parsers. *****************************************************************/ /* Function: esl_msafile_GetLine() * Synopsis: Read next line of input alignment file. * * Purpose: Read next line of input , into its internal * data fields: line> points to the start of the * line in bf>, n> is its length in * bytes, lineoffset> is the offset in the input * to the start of the line, and linenumber> is * the linenumber from <1..N> for total lines in the * input. * * Optionally, caller can request <*opt_p>, <*opt_n>, * which are set to line>,n>. This gives the * caller a modifiable line pointer that it can step * through, while line> is preserved for possible * diagnostics if anything goes awry. * * Args: : an open alignment file input * <*opt_p> : optRETURN: modifiable copy of line> pointer * <*opt_n> : optRETURN: modifiable copy of n> * * Returns: on success. * * at EOF. Now line> is , n> * is <0>, and lineoffset> is <0>. linenumber> * is the total number of lines in the input. * * Throws: if an allocation fails. * if a system call fails, such as fread(). * on internal code errors. */ int esl_msafile_GetLine(ESL_MSAFILE *afp, char **opt_p, esl_pos_t *opt_n) { int status; afp->lineoffset = esl_buffer_GetOffset(afp->bf); if ((status = esl_buffer_GetLine(afp->bf, &(afp->line), &(afp->n))) != eslOK) goto ERROR; if (afp->linenumber != -1) afp->linenumber++; if (opt_p) *opt_p = afp->line; if (opt_n) *opt_n = afp->n; return eslOK; ERROR: afp->line = NULL; afp->n = 0; afp->lineoffset = -1; /* leave linenumber alone. on EOF, it is the number of lines in the file, and that might be useful. */ if (opt_p) *opt_p = NULL; if (opt_n) *opt_n = 0; return status; } /* Function: esl_msafile_PutLine() * Synopsis: Put the line we just read back in the input stream * * Purpose: Put the line we just read back in the input stream * and unset line> and its associated information * internally. The next call * will read exactly the same line again. * * This gets used in parsing files that contain multiple * MSAs. If the way we determine that an MSA record has * ended is by reading the first line of the next MSA * record, then we may want to stuff it back in the input * buffer, so it gets parsed properly as part of the next * record. In Pfam/Stockholm parsing we don't have to * do this, because the first line is just a format code, * with no record-specific data. But in PHYLIP multiple MSA * format, for example, the first line is nseq,alen. * * Args: afp - the open input stream * * Returns: on succes * * Throws: , , if the * call fails. */ int esl_msafile_PutLine(ESL_MSAFILE *afp) { int status; if ((status = esl_buffer_Set(afp->bf, afp->line, 0)) != eslOK) return status; afp->line = NULL; afp->n = 0; if (afp->linenumber != -1) afp->linenumber--; afp->lineoffset = -1; return eslOK; } /*--------------- end, parser utilities -------------------------*/ /***************************************************************** * 10. Unit tests *****************************************************************/ #ifdef eslMSAFILE_TESTDRIVE static void utest_format2format(int fmt1, int fmt2) { char msg[] = "esl_msafile: format2format unit test failed"; char tmpfile1[32] = "esltmpXXXXXX"; char tmpfile2[32] = "esltmpXXXXXX"; char tmpfile3[32] = "esltmpXXXXXX"; FILE *ofp = NULL; ESL_MSA *msa1, *msa2, *msa3, *msa4; ESL_MSAFILE *afp; int alphatype = eslAMINO; ESL_ALPHABET *abc = esl_alphabet_Create(alphatype); ESL_ALPHABET *abc2 = NULL; char *testmsa = "\ # STOCKHOLM 1.0\n\ seq1 ACDEFGHIKLMNPQRSTVWYacdefghiklmnpq------ACDEFGHIKLMNPQRSTVWYacde......mnpqrstvwyACDEFGHI______RSTVWYacdefghiklmnpqrstvwy\n\ seq2 ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy\n\ //\n"; /* Create the test msa, msa1, digital mode, no autodetections */ if ( esl_msafile_OpenMem(&abc, testmsa, strlen(testmsa), eslMSAFILE_STOCKHOLM, NULL, &afp) != eslOK) esl_fatal(msg); if ( esl_msafile_Read(afp, &msa1) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); /* Write it to tmpfile1 in fmt1. (This exercises writing of digital MSAs, in all formats) */ if ( esl_tmpfile_named(tmpfile1, &ofp) != eslOK) esl_fatal(msg); if ( esl_msafile_Write(ofp, msa1, fmt1) != eslOK) esl_fatal(msg); fclose(ofp); /* Read it back from in TEXT mode (verbatim), with format autodetection (except A2M) */ if (fmt1 != eslMSAFILE_A2M) { if ( esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(msg); if (fmt1 == eslMSAFILE_PFAM && afp->format == eslMSAFILE_STOCKHOLM) afp->format = eslMSAFILE_PFAM; if (fmt1 == eslMSAFILE_PSIBLAST && afp->format == eslMSAFILE_SELEX) afp->format = eslMSAFILE_PSIBLAST; if ( esl_msafile_Read(afp, &msa2) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); } else // without autodetection: { if ( esl_msafile_Open(NULL, tmpfile1, NULL, fmt1, NULL, &afp) != eslOK) esl_fatal(msg); if ( esl_msafile_Read(afp, &msa2) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); } /* Write it to tmpfile2 in fmt2. (This exercises writing of text-mode MSAs, in all formats) */ if ( esl_tmpfile_named(tmpfile2, &ofp) != eslOK) esl_fatal(msg); if ( esl_msafile_Write(ofp, msa2, fmt2) != eslOK) esl_fatal(msg); fclose(ofp); /* Read it back in TEXT mode, with format autodetection (except A2M) */ if (fmt2 != eslMSAFILE_A2M) { if ( esl_msafile_Open(NULL, tmpfile2, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(msg); if (fmt2 == eslMSAFILE_PFAM && afp->format == eslMSAFILE_STOCKHOLM) afp->format = eslMSAFILE_PFAM; if (fmt2 == eslMSAFILE_PSIBLAST && afp->format == eslMSAFILE_SELEX) afp->format = eslMSAFILE_PSIBLAST; if ( esl_msafile_Read(afp, &msa3) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); } else // without autodetection: { if ( esl_msafile_Open(NULL, tmpfile2, NULL, fmt2, NULL, &afp) != eslOK) esl_fatal(msg); if ( esl_msafile_Read(afp, &msa3) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); } /* Write it to tmpfile4 in fmt2. (This exercises writing of digital-mode MSAs, in all formats */ if ( esl_tmpfile_named(tmpfile3, &ofp) != eslOK) esl_fatal(msg); if ( esl_msafile_Write(ofp, msa3, fmt2) != eslOK) esl_fatal(msg); fclose(ofp); /* Read it back in DIGITAL mode, with alphabet autodetection but not format */ if ( esl_msafile_Open(&abc2, tmpfile3, NULL, fmt2, NULL, &afp) != eslOK) esl_fatal(msg); if ( esl_msafile_Read(afp, &msa4) != eslOK) esl_fatal(msg); esl_msafile_Close(afp); /* Now: * msa1 = digital mode test alignment, created from Stockholm string * msa2 = TEXT mode, read from tmpfile1 * msa3 = TEXT mode, read from tmpfile2 * msa4 = digital mode, read from tmpfile3 * So we expect: * msa2==msa3 * msa1==msa4 */ /* some normalization before comparing alignments. */ esl_msa_SymConvert(msa2, "abcdefghijklmnopqrstuvwxyz.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ-"); esl_msa_SymConvert(msa3, "abcdefghijklmnopqrstuvwxyz.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ-"); if (msa2->rf) { free (msa2->rf); msa2->rf = NULL; } if (msa3->rf) { free (msa3->rf); msa3->rf = NULL; } if (msa4->rf) { free (msa4->rf); msa4->rf = NULL; } if (esl_msa_Compare(msa2, msa3) != eslOK) esl_fatal(msg); if (esl_msa_Compare(msa1, msa4) != eslOK) esl_fatal(msg); remove(tmpfile1); remove(tmpfile2); remove(tmpfile3); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); esl_msa_Destroy(msa3); esl_msa_Destroy(msa4); esl_alphabet_Destroy(abc); esl_alphabet_Destroy(abc2); } #endif /*eslMSAFILE_TESTDRIVE*/ /*----------------- end, unit tests -----------------------------*/ /***************************************************************** * 11. Test driver *****************************************************************/ #ifdef eslMSAFILE_TESTDRIVE /* compile: gcc -g -Wall -I. -L. -o esl_msafile_utest -DeslMSAFILE_TESTDRIVE esl_msafile.c -leasel -lm * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_utest -DeslMSAFILE_TESTDRIVE esl_msafile.c -leasel -lm * run: ./esl_msafile_utest */ #include #include #include "easel.h" #include "esl_getopts.h" #include "esl_msafile.h" static ESL_OPTIONS options[] = { /* name type default env range togs reqs incomp help docgrp */ {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0}, { 0,0,0,0,0,0,0,0,0,0}, }; static char usage[] = "[-options]"; static char banner[] = "test driver for MSA input/output format module"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); int fmt1, fmt2; for (fmt1 = eslMSAFILE_STOCKHOLM; fmt1 <= eslMSAFILE_PHYLIPS; fmt1++) for (fmt2 = eslMSAFILE_STOCKHOLM; fmt2 <= eslMSAFILE_PHYLIPS; fmt2++) utest_format2format(fmt1, fmt2); esl_getopts_Destroy(go); exit(0); } #endif /*eslMSAFILE_TESTDRIVE*/ /*----------------- end, test driver ----------------------------*/ /***************************************************************** * 12. Examples. *****************************************************************/ #ifdef eslMSAFILE_EXAMPLE /*::cexcerpt::msafile_example::begin::*/ #include #include "easel.h" #include "esl_alphabet.h" #include "esl_getopts.h" #include "esl_msa.h" #include "esl_msafile.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-i", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show info, instead of converting format", 0 }, { "--informat", eslARG_STRING, NULL, NULL, NULL, NULL, NULL, NULL, "specify the input MSA file is in format ", 0 }, { "--outformat", eslARG_STRING, "Clustal", NULL, NULL, NULL, NULL, NULL, "write the output MSA in format ", 0 }, { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet", 0 }, { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet", 0 }, { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use protein alphabet", 0 }, { "--text", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options] "; static char banner[] = "example of multiple alignment input and output using the msafile module(s)"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *msafile = esl_opt_GetArg(go, 1); int infmt = eslMSAFILE_UNKNOWN; int outfmt = eslMSAFILE_UNKNOWN; ESL_ALPHABET *abc = NULL; ESL_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int textmode = esl_opt_GetBoolean(go, "--text"); int showinfo = esl_opt_GetBoolean(go, "-i"); int nali = 0; int status; /* If you know the alphabet you want, create it - you'll pass it to esl_msafile_Open() */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* If you know the MSA file format, set it (, here). */ if (esl_opt_IsOn(go, "--informat") && (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat")); /* Open in text or digital mode. * To let the Open() function autoguess the format, you pass . * To let it autoguess the alphabet, you set and pass <&abc>. * To open in text mode instead of digital, you pass for the alphabet argument. * esl_msafile_OpenFailure() is a convenience, printing various diagnostics of any * open failure to . You can of course handle your own diagnostics instead. */ if (textmode) status = esl_msafile_Open(NULL, msafile, NULL, infmt, NULL, &afp); else status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp); if (status != eslOK) esl_msafile_OpenFailure(afp, status); if (showinfo) { printf("# Format: %s\n", esl_msafile_DecodeFormat(afp->format)); printf("# Alphabet: %s\n", (afp->abc ? esl_abc_DecodeType(afp->abc->type) : "text mode")); } /* Choose the output file format */ if ( (outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid MSA file format for --outformat", esl_opt_GetString(go, "--outformat")); while ((status = esl_msafile_Read(afp, &msa)) == eslOK) { /* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data; * if text MSA: msa->aseq[idx=0..nseq-1][acol=0..alen-1] */ nali++; if (showinfo) printf("# alignment %5d: %15s: %6d seqs, %5d columns\n\n", nali, msa->name, (int) msa->nseq, (int) msa->alen); else esl_msafile_Write(stdout, msa, outfmt); esl_msa_Destroy(msa); } if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */ esl_alphabet_Destroy(abc); esl_msafile_Close(afp); esl_getopts_Destroy(go); exit(0); } /*::cexcerpt::msafile_example::end::*/ #endif /*eslMSAFILE_EXAMPLE*/ /*------------------------ end of examples -----------------------*/