/* Unaligned ascii sequence file i/o. * * Contents: * 1. An object, in text mode. * 2. An object, in digital mode. * 3. Miscellaneous routines. * 4. Sequence reading (sequential). * 5. Sequence/subsequence fetching, random access * 6. Internal routines shared by parsers. * 7. Internal routines for EMBL format (including UniProt, TrEMBL) * 8. Internal routines for GenBank format * 9. Internal routines for FASTA format * 10. Internal routines for daemon format * 11. Internal routines for HMMPGMD format * * This module shares remote evolutionary homology with Don Gilbert's * seminal, public domain ReadSeq package, though the last common * ancestor was circa 1991 and no recognizable vestiges are likely to * remain. Thanks Don! * */ #include #include #include #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_msa.h" #include "esl_msafile.h" #include "esl_sqio.h" #include "esl_sq.h" #include "esl_ssi.h" /* format specific routines */ static int sqascii_GuessFileFormat(ESL_SQFILE *sqfp, int *ret_fmt); static int sqascii_Position (ESL_SQFILE *sqfp, off_t offset); static void sqascii_Close (ESL_SQFILE *sqfp); static int sqascii_SetDigital (ESL_SQFILE *sqfp, const ESL_ALPHABET *abc); static int sqascii_GuessAlphabet (ESL_SQFILE *sqfp, int *ret_type); static int sqascii_Read (ESL_SQFILE *sqfp, ESL_SQ *sq); static int sqascii_ReadInfo (ESL_SQFILE *sqfp, ESL_SQ *sq); static int sqascii_ReadSequence (ESL_SQFILE *sqfp, ESL_SQ *sq); static int sqascii_ReadWindow (ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq); static int sqascii_ReadBlock (ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int max_init_window, int long_target); static int sqascii_Echo (ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp); static int sqascii_IsRewindable (const ESL_SQFILE *sqfp); static const char *sqascii_GetError (const ESL_SQFILE *sqfp); static int sqascii_OpenSSI (ESL_SQFILE *sqfp, const char *ssifile_hint); static int sqascii_PositionByKey (ESL_SQFILE *sqfp, const char *key); static int sqascii_PositionByNumber(ESL_SQFILE *sqfp, int which); static int sqascii_Fetch (ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq); static int sqascii_FetchInfo (ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq); static int sqascii_FetchSubseq (ESL_SQFILE *sqfp, const char *source, int64_t start, int64_t end, ESL_SQ *sq); /* Internal routines shared by parsers. */ static int loadmem (ESL_SQFILE *sqfp); static int loadbuf (ESL_SQFILE *sqfp); static int nextchar (ESL_SQFILE *sqfp, char *ret_c); static int seebuf (ESL_SQFILE *sqfp, int64_t maxn, int64_t *opt_nres, int64_t *opt_endpos); static void addbuf (ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nres); static void skipbuf (ESL_SQFILE *sqfp, int64_t nskip); static int read_nres(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nskip, int64_t nres, int64_t *opt_actual_nres); static int skip_whitespace(ESL_SQFILE *sqfp); /* EMBL format; also UniProt, TrEMBL */ static void config_embl(ESL_SQFILE *sqfp); static void inmap_embl (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap); static int header_embl(ESL_SQFILE *sqfp, ESL_SQ *sq); static int skip_embl (ESL_SQFILE *sqfp, ESL_SQ *sq); static int end_embl (ESL_SQFILE *sqfp, ESL_SQ *sq); /* GenBank format; also DDBJ */ static void config_genbank(ESL_SQFILE *sqfp); static void inmap_genbank (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap); static int header_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq); static int skip_genbank (ESL_SQFILE *sqfp, ESL_SQ *sq); static int end_genbank (ESL_SQFILE *sqfp, ESL_SQ *sq); /* FASTA format */ static void config_fasta(ESL_SQFILE *sqfp); static void inmap_fasta (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap); static int header_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq); static int skip_fasta (ESL_SQFILE *sqfp, ESL_SQ *sq); static int end_fasta (ESL_SQFILE *sqfp, ESL_SQ *sq); /* daemon format */ static void config_daemon(ESL_SQFILE *sqfp); static void inmap_daemon (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap); static int end_daemon (ESL_SQFILE *sqfp, ESL_SQ *sq); /* HMMPGMD format */ static int fileheader_hmmpgmd(ESL_SQFILE *sqfp); /***************************************************************** *# 1. An object, in text mode. *****************************************************************/ /* Function: esl_sqascii_Open() * Synopsis: Open a sequence file for reading. * * Purpose: Open a sequence file for reading. * The opened is returned through . * * The format of the file is asserted to be (for * example, ). If is * then the routine attempts to * autodetect the file format. * * There are two special cases for . If * is "-", the sequence data are read from a * pipe. If ends in ".gz", the file is * assumed to be compressed with , and it is opened * by a pipe from . Reading gzip files only works * on POSIX-compliant systems that have pipes * (specifically, the POSIX.2 popen() call). * * Returns: on success, and <*ret_sqfp> points to a new * open . Caller deallocates this object with * . * * Returns if can't be found or * opened. Returns if the file is empty, or * if autodetection is attempted and the format can't be * determined. On any error condition, <*ret_sqfp> is * returned NULL. * * Throws: on allocation failure. */ int esl_sqascii_Open(char *filename, int format, ESL_SQFILE *sqfp) { int status;/* return status from an ESL call */ int n; int nc; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* before we go any further, make sure we can handle the format */ if (format == eslSQFILE_NCBI) return eslENOTFOUND; /* Default initializations */ ascii->fp = NULL; ascii->do_gzip = FALSE; ascii->do_stdin = FALSE; ascii->do_buffer = FALSE; ascii->mem = NULL; ascii->allocm = 0; ascii->mn = 0; ascii->mpos = 0; ascii->moff = -1; ascii->is_recording = FALSE; ascii->buf = NULL; ascii->boff = 0; ascii->balloc = 0; ascii->nc = 0; ascii->bpos = 0; ascii->L = 0; ascii->linenumber = 1; ascii->bookmark_offset = 0; ascii->bookmark_linenum = 0; ascii->is_linebased = FALSE; ascii->eof_is_ok = FALSE; ascii->parse_header = NULL; ascii->skip_header = NULL; ascii->parse_end = NULL; ascii->afp = NULL; ascii->msa = NULL; ascii->idx = -1; ascii->ssifile = NULL; ascii->rpl = -1; /* -1 = not set yet */ ascii->bpl = -1; /* (ditto) */ ascii->prvrpl = -1; /* (ditto) */ ascii->prvbpl = -1; /* (ditto) */ ascii->currpl = -1; ascii->curbpl = -1; ascii->ssi = NULL; /* MSA formats are handled entirely by msafile module - * let it handle stdin, .gz, etc */ if (! esl_sqio_IsAlignment(format)) { if (strcmp(filename, "-") == 0) /* stdin special case */ { ascii->fp = stdin; ascii->do_stdin = TRUE; } else { /* Check the current working directory first. */ if ((ascii->fp = fopen(filename, "r")) == NULL) { status = eslENOTFOUND; goto ERROR; } } /* Deal with the .gz special case: to popen(), "success" only means * it found and executed gzip -dc. If gzip -dc doesn't find our * file, popen() still blithely returns success, so we have to be * sure the file exists. That's why we fopen()'ed it above, only to * close it and popen() it here. */ #ifdef HAVE_POPEN n = strlen(filename); if (n > 3 && strcmp(filename+n-3, ".gz") == 0) { char *cmd; fclose(ascii->fp); nc = strlen("gzip -dc ") + n + 1; ESL_ALLOC(cmd, nc); snprintf(cmd, nc, "gzip -dc %s", filename); ascii->fp = popen(cmd, "r"); if (ascii->fp == NULL) { status = eslENOTFOUND; goto ERROR; } ascii->do_gzip = TRUE; free(cmd); } #endif /*HAVE_POPEN*/ /* If we don't know the format yet, try to autodetect it now. */ if (format == eslSQFILE_UNKNOWN) { status = sqascii_GuessFileFormat(sqfp, &format); if (status == eslOK) sqfp->format = format; else if (status != eslEFORMAT) goto ERROR; /* might still be eslSQFILE_UNKNOWN, for MSA files */ } /* If the format is still unknown, it may be an MSA file. The * msafile module is capable of autodetecting format even in a .gz * or stdin pipe, but the stuff above has already read from these * nonrewindable sources, trying to guess an unaligned format. We * could open a second .gz pipe, but that's ugly; and in any case, * we can't rewind stdin. Eventually, this will get resolved, by * having sqio open an ESL_BUFFER, then doing an * esl_msafile_OpenBuffer() if we need to hand control to the * msafile module. For now, sqio is already documented to be * unable to autodetect MSA file formats in stdin or .gz pipes, * so leave it that way. */ if (format == eslSQFILE_UNKNOWN && (ascii->do_gzip || ascii->do_stdin)) { status = eslEFORMAT; goto ERROR; } } /* If format is definitely an MSA, open it through the msafile interface. * Or, if format is still unknown, try to open the file as an MSA file, * using msafile autodetection. */ if (format == eslSQFILE_UNKNOWN || esl_sqio_IsAlignment(format)) { status = esl_msafile_Open(NULL, filename, NULL, format, NULL, &(ascii->afp)); if (status != eslOK) { status = eslEFORMAT; goto ERROR; } /* This was our last attempt. Failure to open == failure to detect format */ sqfp->format = format = ascii->afp->format; } if (format == eslSQFILE_UNKNOWN) { status = eslEFORMAT; goto ERROR; } /* Configure the 's parser and inmaps for this format. */ if (!esl_sqio_IsAlignment(format)) { switch (format) { case eslSQFILE_EMBL: config_embl(sqfp); inmap_embl(sqfp, NULL); break; case eslSQFILE_UNIPROT: config_embl(sqfp); inmap_embl(sqfp, NULL); break; case eslSQFILE_GENBANK: config_genbank(sqfp); inmap_genbank(sqfp, NULL); break; case eslSQFILE_DDBJ: config_genbank(sqfp); inmap_genbank(sqfp, NULL); break; case eslSQFILE_FASTA: config_fasta(sqfp); inmap_fasta(sqfp, NULL); break; case eslSQFILE_DAEMON: config_daemon(sqfp); inmap_daemon(sqfp, NULL); break; case eslSQFILE_HMMPGMD: config_fasta(sqfp); inmap_fasta(sqfp, NULL); break; default:status = eslEFORMAT; goto ERROR; } /* Preload the first line or chunk of file. */ status = loadbuf(sqfp); if (status == eslEOF) { status = eslEFORMAT; goto ERROR; } else if (status != eslOK) { goto ERROR; } /* hmmpgmd is a special case: we need to skip first line before parsing it. * generalize that a little: this could be a section for parsing a file header, * and leaving the buf positioned at the first char of the first record * (just as expected if there's no file header) */ switch (format) { case eslSQFILE_HMMPGMD: status = fileheader_hmmpgmd(sqfp); break; default: status = eslOK; break; } if (status != eslOK) goto ERROR; } else { ascii->is_linebased = TRUE; ascii->eof_is_ok = FALSE; /* no-op for msa's */ ascii->parse_header = NULL; /* no-op for msa's */ ascii->skip_header = NULL; /* no-op for msa's */ ascii->parse_end = NULL; /* no-op for msa's */ } /* initialize the function pointers for the ascii routines */ sqfp->position = &sqascii_Position; sqfp->close = &sqascii_Close; sqfp->set_digital = &sqascii_SetDigital; sqfp->guess_alphabet = &sqascii_GuessAlphabet; sqfp->is_rewindable = &sqascii_IsRewindable; sqfp->read = &sqascii_Read; sqfp->read_info = &sqascii_ReadInfo; sqfp->read_seq = &sqascii_ReadSequence; sqfp->read_window = &sqascii_ReadWindow; sqfp->echo = &sqascii_Echo; sqfp->read_block = &sqascii_ReadBlock; sqfp->open_ssi = &sqascii_OpenSSI; sqfp->pos_by_key = &sqascii_PositionByKey; sqfp->pos_by_number = &sqascii_PositionByNumber; sqfp->fetch = &sqascii_Fetch; sqfp->fetch_info = &sqascii_FetchInfo; sqfp->fetch_subseq = &sqascii_FetchSubseq; sqfp->get_error = &sqascii_GetError; return eslOK; ERROR: sqascii_Close(sqfp); return status; } /* Function: sqascii_GuessFileFormat() * Synopsis: Guess the format of an open . * * Purpose: Try to guess the sequence file format of , and * return the format code in <*ret_fmt>. * * First we attempt to guess based on the 's * suffix. <*.fa> is assumed to be in FASTA format; <*.gb> * is assumed to be in GenBank format. * * If that fails, we attempt to guess based on peeking at * the first nonblank line of . If the line * starts with $>$, we assume FASTA format; if the line * starts with , we assume EMBL format; if the line * starts with or it contains the string we assume GenBank format. * * If that fails too, return an error, and * <*ret_fmt> is set to . * * Returns: on success, and <*ret_fmt> contains * a valid sequence file format code, such as * . * * Returns if we opened but it * contains no nonblank lines, or if we peeked at the first * nonblank line and still couldn't guess the format; * <*ret_fmt> is then . * * Throws: on allocation failure. */ static int sqascii_GuessFileFormat(ESL_SQFILE *sqfp, int *ret_fmt) { int n = strlen(sqfp->filename); const char *sfx = NULL; int is_gzip = FALSE; int nsfx; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* On any premature exit, *ret_fmt is eslSQFILE_UNKNOWN */ *ret_fmt = eslSQFILE_UNKNOWN; /* Is gzip'ed? Look at suffix. */ if (n > 3 && strcmp(sqfp->filename+n-3, ".gz") == 0) is_gzip = TRUE; /* Locate the suffix that might indicate format (ignore .gz) */ for (nsfx = 1, sfx = sqfp->filename + n - 1 - (is_gzip ? 3 : 0); sfx != sqfp->filename && *sfx != '.'; sfx--) nsfx++; /* now sfx points either to filename (we didn't find a suffix) or to the . of the suffix, * and nsfx is the suffix length inclusive of the . */ /* Attempt to guess file format based on file name suffix. */ if (nsfx && strcmp(sfx, ".fa") == 0) { *ret_fmt = eslSQFILE_FASTA; return eslOK; } else if (nsfx && strcmp(sfx, ".gb") == 0) { *ret_fmt = eslSQFILE_GENBANK; return eslOK; } /* If that didn't work, we'll have a peek at the stream; * turn recording on, and set for line based input. */ if (ascii->is_recording == -1) ESL_EXCEPTION(eslEINVAL, "sq file already too advanced"); ascii->is_recording = TRUE; ascii->is_linebased = TRUE; loadbuf(sqfp);/* now ascii->buf is a line of the file */ /* get first nonblank line */ while (esl_str_IsBlank(ascii->buf)) { status = loadbuf(sqfp); if (status == eslEOF) ESL_XFAIL(eslEFORMAT, ascii->errbuf, "No data found in file"); else if (status != eslOK) goto ERROR; } /* formats that can be determined from the first line: */ if (*(ascii->buf) == '>') *ret_fmt = eslSQFILE_FASTA; else if (strncmp(ascii->buf, "ID ", 5) == 0) *ret_fmt = eslSQFILE_EMBL; else if (strncmp(ascii->buf, "LOCUS ", 8) == 0) *ret_fmt = eslSQFILE_GENBANK; else if (strstr(ascii->buf, "Genetic Sequence Data Bank") != NULL) *ret_fmt = eslSQFILE_GENBANK; /* reset the sqfp */ ascii->mpos = 0; ascii->is_recording = FALSE; ascii->is_linebased = FALSE; free(ascii->buf); ascii->buf = NULL; ascii->balloc = 0; return (*ret_fmt == eslSQFILE_UNKNOWN) ? eslEFORMAT : eslOK; ERROR: ascii->mpos = 0; ascii->is_recording = FALSE; ascii->is_linebased = FALSE; if (ascii->buf != NULL) { free(ascii->buf); ascii->balloc = 0; } return status; } /* Function: sqascii_Position() * Synopsis: Reposition an open sequence file to an offset. * * Purpose: Reposition an open to offset . * would usually be the first byte of a * desired sequence record. * * Only normal sequence files can be positioned to a * nonzero offset. If corresponds to a standard * input stream or gzip -dc stream, it may not be * repositioned. If corresponds to a multiple * sequence alignment file, the only legal * is 0, to rewind the file to the beginning and * be able to read the entire thing again. * * After is called on a nonzero * , linenumber> and other bookkeeping * information is unknown. If caller knows it, it should * set it explicitly. * * See the SSI module for manipulating offsets and indices. * * Returns: on success; * if no data can be read from this position. * * Throws: if the fseeko() or fread() call fails. * on (re-)allocation failure. * if the is not positionable. * if in trying to rewind an alignment file * by closing and reopening it, the open fails. * On errors, the state of is indeterminate, and * it should not be used again. */ static int sqascii_Position(ESL_SQFILE *sqfp, off_t offset) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->do_stdin) ESL_EXCEPTION(eslEINVAL, "can't Position() in standard input"); if (ascii->do_gzip) ESL_EXCEPTION(eslEINVAL, "can't Position() in a gzipped file"); if (offset < 0) ESL_EXCEPTION(eslEINVAL, "bad offset"); if (offset > 0 && ascii->afp != NULL) ESL_EXCEPTION(eslEINVAL, "can't use esl_sqfile_Position() w/ nonzero offset on MSA file"); if (esl_sqio_IsAlignment(sqfp->format)) {/* msa file: close and reopen. maybe sometime we'll have esl_msafile_Rewind() */ /* we have already verified that offset==0 for MSA file */ esl_msafile_Close(ascii->afp); if (ascii->msa != NULL) esl_msa_Destroy(ascii->msa); ascii->afp = NULL; ascii->msa = NULL; ascii->idx = 0; /* we know we successfully opened it the first time, so a failure to reopen is an exception, not a user-reportable normal error. ENOTFOUND is the only normal error; EFORMAT error can't occur because we know the format and don't use autodetection. */ status = esl_msafile_Open(NULL, sqfp->filename, NULL, sqfp->format, NULL, &(ascii->afp)); if (status == eslENOTFOUND) ESL_EXCEPTION(eslENOTFOUND, "failed to reopen alignment file"); else if (status != eslOK) return status; } else/* normal case: unaligned sequence file */ { if (fseeko(ascii->fp, offset, SEEK_SET) != 0) ESL_EXCEPTION(eslESYS, "fseeko() failed"); ascii->currpl = -1; ascii->curbpl = -1; ascii->prvrpl = -1; ascii->prvbpl = -1; ascii->linenumber = (offset == 0) ? 1 : -1; /* -1 is "unknown" */ ascii->L = -1; ascii->mpos = ascii->mn;/* this forces loadbuf to load new data */ if ((status = loadbuf(sqfp)) != eslOK) return status; } return eslOK; } /* Function: sqascii_Close() * Synopsis: Close a sequence file. * * Purpose: Closes an open . * * Returns: (void). */ static void sqascii_Close(ESL_SQFILE *sqfp) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; #ifdef HAVE_POPEN if (ascii->do_gzip) pclose(ascii->fp); else #endif if (! ascii->do_stdin && ascii->fp != NULL) fclose(ascii->fp); if (ascii->ssifile != NULL) free(ascii->ssifile); if (ascii->mem != NULL) free(ascii->mem); if (ascii->balloc > 0) free(ascii->buf); if (ascii->ssi != NULL) esl_ssi_Close(ascii->ssi); if (ascii->afp != NULL) esl_msafile_Close(ascii->afp); if (ascii->msa != NULL) esl_msa_Destroy(ascii->msa); ascii->do_gzip = FALSE; ascii->do_stdin = FALSE; ascii->fp = NULL; ascii->ssifile = NULL; ascii->mem = NULL; ascii->balloc = 0; ascii->buf = NULL; ascii->ssi = NULL; ascii->afp = NULL; ascii->msa = NULL; return; } /*------------------- ESL_SQFILE open/close -----------------------*/ /***************************************************************** *# 2. An object, in digital mode [with ] *****************************************************************/ /* Function: sqascii_SetDigital() * Synopsis: Set an open to read in digital mode. * * Purpose: Given an that's already been opened, * configure it to expect subsequent input to conform * to the digital alphabet . * * Calling is * equivalent to . The two-step * version is useful when you need a * call in between, guessing * the file's alphabet in text mode before you set it to * digital mode. * * Returns: on success. */ static int sqascii_SetDigital(ESL_SQFILE *sqfp, const ESL_ALPHABET *abc) { int status = eslOK; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (!esl_sqio_IsAlignment(sqfp->format)) { switch (sqfp->format) { case eslSQFILE_EMBL: inmap_embl(sqfp, abc->inmap); break; case eslSQFILE_UNIPROT: inmap_embl(sqfp, abc->inmap); break; case eslSQFILE_GENBANK: inmap_genbank(sqfp, abc->inmap); break; case eslSQFILE_DDBJ: inmap_genbank(sqfp, abc->inmap); break; case eslSQFILE_FASTA: inmap_fasta(sqfp, abc->inmap); break; case eslSQFILE_DAEMON: inmap_daemon(sqfp, abc->inmap); break; default: status = eslEFORMAT; break; } } else esl_msafile_SetDigital(ascii->afp, abc); return status; } /* Function: sqascii_GuessAlphabet() * Synopsis: Guess the alphabet of an open . * * Purpose: After opening , attempt to guess what alphabet * its sequences are in, by inspecting the first sequence * in the file, and return this alphabet type in <*ret_type>. * * Returns: on success, and <*ret_type> is set to , * , or . * * Returns and sets <*ret_type> to * if the first sequence (or alignment) * in the file contains no more than ten residues total, * or if its alphabet cannot be guessed (i.e. it contains * IUPAC degeneracy codes, but no amino acid specific * residues). * * Returns if a parse error is encountered in * trying to read the sequence file. errbuf> is set * to a useful error message if this occurs, * linenumber> is the line on which the error * occurred, and <*ret_type> is set to . * * Returns and sets <*ret_type> to * if the file appears to be empty. * * Throws: on allocation error; * on unimaginable internal errors. */ static int sqascii_GuessAlphabet(ESL_SQFILE *sqfp, int *ret_type) { ESL_SQ *sq = NULL; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* Special case: for MSA files, hand this off to msafile_GuessAlphabet. */ if (esl_sqio_IsAlignment(sqfp->format)) return esl_msafile_GuessAlphabet(ascii->afp, ret_type); /* set the sqfp to record; we'll rewind afterwards and use the recording */ ascii->is_recording = TRUE; if ((sq = esl_sq_Create()) == NULL) { status = eslEMEM; goto ERROR; } status = sqascii_ReadWindow(sqfp, 0, 4000, sq); if (status == eslEOF) { status = eslENODATA; goto ERROR; } else if ((status != eslOK) && (status != eslEOD)) goto ERROR; if ((status = esl_sq_GuessAlphabet(sq, ret_type)) != eslOK) goto ERROR; /* reset the sqfp, so it uses the recording next */ ascii->mpos = 0; ascii->linenumber = 1; ascii->is_recording = FALSE; if ((status = loadbuf(sqfp)) != eslOK) ESL_EXCEPTION(status, "buffer load failed, but shouldn't have"); esl_sq_Destroy(sq); return eslOK; ERROR: esl_sq_Destroy(sq); *ret_type = eslUNKNOWN; return status; } /*-------------- end, digital mode ESL_SQFILE -------------------*/ /***************************************************************** *# 3. Miscellaneous routines *****************************************************************/ /* Function: sqascii_IsRewindable() * Synopsis: Return if can be rewound. * * Purpose: Returns if can be rewound (positioned * to an offset of zero), in order to read it a second * time. */ static int sqascii_IsRewindable(const ESL_SQFILE *sqfp) { if (sqfp->data.ascii.do_gzip == TRUE) return FALSE; if (sqfp->data.ascii.do_stdin == TRUE) return FALSE; return TRUE; } /* Function: sqascii_GetError() * Synopsis: Return if can be rewound. * * Purpose: Returns if can be rewound (positioned * to an offset of zero), in order to read it a second * time. */ static const char * sqascii_GetError(const ESL_SQFILE *sqfp) { return sqfp->data.ascii.errbuf; } /***************************************************************** *# 4. Sequence reading (sequential) *****************************************************************/ /* Function: sqascii_Read() * Synopsis: Read the next sequence from a file. * * Purpose: Reads the next sequence from open sequence file into * . Caller provides an allocated and initialized , which * will be internally reallocated if its space is insufficient. * * Returns: on success; the new sequence is stored in . * * Returns when there is no sequence left in the * file (including first attempt to read an empty file). * * Returns if there's a problem with the format, * such as an illegal character; the line number that the parse * error occurs on is in linenumber>, and an informative * error message is placed in errbuf>. * * Throws: on allocation failure; * on internal error. */ static int sqascii_Read(ESL_SQFILE *sqfp, ESL_SQ *sq) { int status; int64_t epos; int64_t n; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (esl_sqio_IsAlignment(sqfp->format)) { ESL_SQ *tmpsq = NULL; if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq) { /* we need to load a new alignment? */ esl_msa_Destroy(ascii->msa); status = esl_msafile_Read(ascii->afp, &(ascii->msa)); if (status == eslEFORMAT) { /* oops, a parse error; upload the error info from afp to sqfp */ ascii->linenumber = ascii->afp->linenumber; strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */ return eslEFORMAT; } if (status != eslOK) return status; ascii->idx = 0; } /* grab next seq from alignment */ /* this is inefficient; it goes via a temporarily allocated copy of the sequence */ if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) return status; esl_sq_GrowTo(sq, tmpsq->n); esl_sq_Copy(tmpsq, sq); esl_sq_Destroy(tmpsq); ascii->idx++; sq->start = 1; sq->end = sq->n; sq->C = 0; sq->W = sq->n; sq->L = sq->n; return eslOK; } /* Main case: read next seq from sqfp's stream */ if (ascii->nc == 0) return eslEOF; if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EMEM, EOF, EFORMAT */ do { if ((status = seebuf(sqfp, -1, &n, &epos)) == eslEFORMAT) return status; if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM; addbuf(sqfp, sq, n); ascii->L += n; sq->eoff = ascii->boff + epos - 1; if (status == eslEOD) break; } while ((status = loadbuf(sqfp)) == eslOK); if (status == eslEOF) { if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?"); if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status; } else if (status == eslEOD) { ascii->bpos = epos; if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status; } else if (status != eslOK) return status; if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL; else sq->seq[sq->n] = '\0'; sq->start = 1; sq->end = sq->n; sq->C = 0; sq->W = sq->n; sq->L = sq->n; return eslOK; } /* Function: sqascii_ReadInfo() * Synopsis: Read sequence info, but not the sequence itself. * * Purpose: Read the next sequence from open sequence file , * but don't store the sequence (or secondary structure). * Upon successful return, holds all the available * information about the sequence -- its name, accession, * description, and overall length L>. * * This is useful for indexing sequence files, where * individual sequences might be ginormous, and we'd rather * avoid reading complete seqs into memory. * * Returns: on success. * * Throws: on allocation error. */ static int sqascii_ReadInfo(ESL_SQFILE *sqfp, ESL_SQ *sq) { int status; int64_t epos; int64_t n; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (esl_sqio_IsAlignment(sqfp->format)) { ESL_SQ *tmpsq = NULL; if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq) { /* we need to load a new alignment? */ esl_msa_Destroy(ascii->msa); status = esl_msafile_Read(ascii->afp, &(ascii->msa)); if (status == eslEFORMAT) { /* oops, a parse error; upload the error info from afp to sqfp */ ascii->linenumber = ascii->afp->linenumber; strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */ return eslEFORMAT; } if (status != eslOK) return status; ascii->idx = 0; } /* grab next seq from alignment */ /* this is inefficient; it goes via a temporarily allocated copy of the sequence */ if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) return status; if (tmpsq->dsq != NULL) tmpsq->dsq[1] = eslDSQ_SENTINEL; else tmpsq->seq[0] = '\0'; esl_sq_Copy(tmpsq, sq); esl_sq_Destroy(tmpsq); ascii->idx++; if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; else sq->seq[0] = '\0'; if (sq->ss != NULL) { free(sq->ss); sq->ss = NULL; } sq->n = 0; sq->start = 0; sq->end = 0; sq->C = 0; sq->W = 0; return eslOK; } if (ascii->nc == 0) return eslEOF; if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */ ascii->L = 0; do { status = seebuf(sqfp, -1, &n, &epos); ascii->L += n; sq->eoff = ascii->boff + epos - 1; if (status == eslEFORMAT) return status; if (status == eslEOD) break; } while ((status = loadbuf(sqfp)) == eslOK); if (status == eslEOF) { if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?"); } else if (status == eslEOD) { ascii->bpos = epos; if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status; } else if (status != eslOK) return status; sq->L = ascii->L; /* Set coord system for an info-only ESL_SQ */ if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; else sq->seq[0] = '\0'; if (sq->ss != NULL) { free(sq->ss); sq->ss = NULL; } sq->n = 0; sq->start = 0; sq->end = 0; sq->C = 0; sq->W = 0; return eslOK; } /* Function: sqascii_ReadSequence() * Synopsis: Read the next sequence from a file. * * Purpose: Reads the next sequence from open sequence file into * . Caller provides an allocated and initialized , which * will be internally reallocated if its space is insufficient. * * Returns: on success; the new sequence is stored in . * * Returns when there is no sequence left in the * file (including first attempt to read an empty file). * * Returns if there's a problem with the format, * such as an illegal character; the line number that the parse * error occurs on is in linenumber>, and an informative * error message is placed in errbuf>. * * Throws: on allocation failure; * on internal error. */ static int sqascii_ReadSequence(ESL_SQFILE *sqfp, ESL_SQ *sq) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; int64_t epos; int64_t n; int status; if (esl_sqio_IsAlignment(sqfp->format)) { ESL_SQ *tmpsq = NULL; if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq) { /* we need to load a new alignment? */ esl_msa_Destroy(ascii->msa); status = esl_msafile_Read(ascii->afp, &(ascii->msa)); if (status == eslEFORMAT) { /* oops, a parse error; upload the error info from afp to sqfp */ ascii->linenumber = ascii->afp->linenumber; strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */ return eslEFORMAT; } if (status != eslOK) return status; ascii->idx = 0; } /* grab next seq from alignment */ /* this is inefficient; it goes via a temporarily allocated copy of the sequence */ status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq); // eslEMEM | eslEOD if (status != eslOK) return status; esl_sq_GrowTo(sq, tmpsq->n); esl_sq_Copy(tmpsq, sq); esl_sq_Destroy(tmpsq); ascii->idx++; sq->start = 1; sq->end = sq->n; sq->C = 0; sq->W = sq->n; sq->L = sq->n; return eslOK; } /* Main case: read next seq from sqfp's stream */ if (ascii->nc == 0) return eslEOF; if ((status = ascii->skip_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */ do { if ((status = seebuf(sqfp, -1, &n, &epos)) == eslEFORMAT) return status; if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM; addbuf(sqfp, sq, n); ascii->L += n; sq->eoff = ascii->boff + epos - 1; if (status == eslEOD) break; } while ((status = loadbuf(sqfp)) == eslOK); if (status == eslEOF) { if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?"); if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status; } else if (status == eslEOD) { ascii->bpos = epos; if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status; } else if (status != eslOK) return status; if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL; else sq->seq[sq->n] = '\0'; sq->start = 1; sq->end = sq->n; sq->C = 0; sq->W = sq->n; sq->L = sq->n; return eslOK; } /* Function: sqascii_ReadWindow() * Synopsis: Read next window of sequence. * * Purpose: Read a next window of residues from open file , * keeping residues from the previous window as * context, and keeping previous annotation in the * as before. * * If this is the first window of a new sequence record, * is ignored (there's no previous context yet), and * the annotation fields of the (name, accession, and * description) are initialized by reading the sequence * record's header. This is the only time the annotation * fields are initialized. * * On return, dsq[]> contains the window and its * context; residues <1..sq->C> are the previous context, * and residues C+1..sq->n> are the new window. The * start and end coordinates of the whole * (including context) in the original source sequence are * start..sq->end>. (Or, for text mode sequences, * seq[0..sq->C-1,sq->C..sq->n-1]>, while and * coords are still <1..L>.) * * When a sequence record is completed and no more data * remain, is returned, with an ``info'' * structure (containing the annotation and the total * sequence length , but no sequence). (The total * sequence length is unknown in until this * return.) * * The caller may then do one of two things before calling * again; it can reset the sequence * with to continue reading the next * sequence in the file, or it can set a negative as a * signal to read windows from the reverse complement * (Crick) strand. Reverse complement reading only works * for nucleic acid sequence. * * If you read the reverse complement strand, you must read * the whole thing, calling with * negative windows until is returned again * with an empty (info-only) structure. When that * is reached, the is repositioned at the * start of the next sequence record; the caller should now * the , and the next * call must have a positive , corresponding to starting * to read the Watson strand of the next sequence. * * Note that the interface is designed for * an idiom of sequential reading of complete sequences in * overlapping windows, possibly on both strands; if you * want more freedom to move around in the sequence * grabbing windows in another order, you can use the * interface. * * Reading the reverse complement strand requires file * repositioning, so it will not work on non-repositionable * streams like gzipped files or a stdin pipe. Moreover, * for reverse complement input to be efficient, the * sequence file should have consistent line lengths, * suitable for SSI's fast subsequence indexing. * * Returns: on success; now contains next window of * sequence, with at least 1 new residue. The number * of new residues is W>; C> residues are * saved from the previous window. Caller may now * process residues dsq[sq->C+1]..sq->dsq[sq->n]>. * * if no new residues were read for this sequence * and strand, and now contains an empty info-only * structure (annotation and are valid). Before calling * again, caller will either want * to make negative (to start reading the Crick strand * of the current sequence), or it will want to reset the * (with ) to go on the next sequence. * * if we've already returned before to * signal the end of the previous seq record, and moreover, * there's no more sequence records in the file. * * if an invalid residue is found in the * sequence, or if you attempt to take the reverse * complement of a sequence that can't be reverse * complemented. * * Throws: if you try to read a reverse window before * you've read forward strand. * * if something goes awry internally in the * coordinate system. * * on allocation error. */ static int sqascii_ReadWindow(ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq) { int actual_start; int64_t nres; int64_t line; off_t offset; int status; ESL_SQ *tmpsq = NULL; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (esl_sqio_IsAlignment(sqfp->format)) { /* special: if we're initializing a revcomp window read, back ascii->idx up one */ if (W < 0 && sq->start == 0) ascii->idx--; if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq) { /* need new alignment? */ esl_msa_Destroy(ascii->msa); status = esl_msafile_Read(ascii->afp, &(ascii->msa)); if (status == eslEFORMAT) { /* oops, a parse error; upload the error info from afp to sqfp */ ascii->linenumber = ascii->afp->linenumber; strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */ return eslEFORMAT; } else if (status != eslOK) goto ERROR; ascii->idx = 0; } /* grab appropriate seq from alignment into tmpsq */ if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) goto ERROR; /*by default, tmpsq is an ascii sequence, convert it to digital if that's what sq is*/ if (sq->seq == NULL && (( status = esl_sq_Digitize(sq->abc, tmpsq)) != eslOK)) goto ERROR; /* Figure out tmpsq coords we'll put in sq */ if (W > 0) {/* forward strand */ sq->C = ESL_MIN(sq->n, C); sq->start = sq->end - sq->C + 1; sq->end = ESL_MIN(tmpsq->L, sq->end + W); sq->n = sq->end - sq->start + 1; sq->W = sq->n - sq->C; } else {/* reverse strand */ if (sq->L == -1) ESL_XEXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand"); sq->C = ESL_MIN(sq->n, sq->end + C - 1); sq->end = (sq->start == 0 ? sq->L : sq->end + sq->C - 1); sq->start = ESL_MAX(1, sq->end + W - sq->C - 1); sq->n = sq->end - sq->start + 1; sq->W = sq->n - sq->C; } if (sq->W == 0)/* no new sequence? that's the EOD case */ { sq->start = 0; sq->end = 0; sq->C = 0; sq->W = 0; sq->n = 0; sq->L = tmpsq->L; if (sq->dsq) sq->dsq[1] = eslDSQ_SENTINEL; else if (sq->seq) sq->seq[0] = '\0'; ascii->idx++; esl_sq_Destroy(tmpsq); return eslEOD; } /* Copy the sequence frag. */ if (tmpsq->ss != NULL && sq->ss == NULL) ESL_ALLOC(sq->ss, sizeof(char) * (sq->salloc)); /* this *must* be for salloc */ esl_sq_GrowTo(sq, sq->n); if (tmpsq->seq != NULL) {/* text mode */ memcpy(sq->seq, tmpsq->seq + sq->start - 1, sizeof(char) * sq->n); sq->seq[sq->n] = '\0'; if (tmpsq->ss != NULL) { memcpy(sq->ss, tmpsq->ss + sq->start - 1, sizeof(char) * sq->n); sq->ss[sq->n] = '\0'; } } else { memcpy(sq->dsq + 1, tmpsq->dsq + sq->start, sizeof(ESL_DSQ) * sq->n); sq->dsq[sq->n+1] = eslDSQ_SENTINEL; if (tmpsq->ss != NULL) { memcpy(sq->ss + 1, tmpsq->ss + sq->start, sizeof(char) * sq->n); sq->ss[sq->n+1] = '\0'; } } if (W < 0 && (status = esl_sq_ReverseComplement(sq)) != eslOK) ESL_XFAIL(eslEINVAL, ascii->errbuf, "Can't reverse complement that sequence window"); /* Copy annotation */ if ((status = esl_sq_SetName (sq, tmpsq->name)) != eslOK) goto ERROR; if ((status = esl_sq_SetSource (sq, tmpsq->name)) != eslOK) goto ERROR; if ((status = esl_sq_SetAccession(sq, tmpsq->acc)) != eslOK) goto ERROR; if ((status = esl_sq_SetDesc (sq, tmpsq->desc)) != eslOK) goto ERROR; sq->roff = -1; sq->doff = -1; sq->eoff = -1; sq->hoff = -1; esl_sq_Destroy(tmpsq); return eslOK; } /* Now for the normal case: we're reading a normal unaligned seq file, not an alignment. */ /* Negative W indicates reverse complement direction */ if (W < 0) { if (sq->L == -1) ESL_EXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand"); if (sq->end == 1 || sq->L == 0) { /* last end == 1 means last window was the final one on reverse strand, * so we're EOD; jump back to last forward position. * * Also check for the unusual case of sq->L == 0, a completely empty sequence: * in that case, immediately return eslEOD. */ if (ascii->bookmark_offset > 0) { if (esl_sqfile_Position(sqfp, ascii->bookmark_offset) != eslOK) ESL_EXCEPTION(eslECORRUPT, "Failed to reposition seq file at last forward bookmark"); ascii->linenumber = ascii->bookmark_linenum; } else ascii->nc = 0; /* signals EOF */ ascii->bookmark_offset = 0; ascii->bookmark_linenum = 0; sq->start = 0; sq->end = 0; sq->C = 0; sq->W = 0; sq->n = 0; /* sq->L stays as it is */ if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; else sq->seq[0] = '\0'; return eslEOD; } /* If sq->start == 0, we haven't read any reverse windows yet; * init reading from sq->L */ W = -W; if (sq->start == 0) { sq->start = ESL_MAX(1, (sq->L - W + 1)); sq->end = sq->L; sq->C = 0; sq->W = sq->end - sq->start + 1; ascii->curbpl = -1; ascii->currpl = -1; ascii->prvbpl = -1; ascii->prvrpl = -1; ascii->linenumber = -1; ascii->L = -1; } else { /* Else, we're continuing to next window; prv was .. */ sq->C = ESL_MIN(C, sq->L - sq->end + 1); /* based on prev window's end */ sq->end = sq->end + sq->C - 1; /* also based on prev end */ sq->start = ESL_MAX(1, (sq->end - W - sq->C + 1)); sq->W = sq->end - sq->start + 1 - sq->C; } /* Now position for a subseq fetch of on fwd strand, using SSI offset calc */ ESL_DASSERT1(( sq->doff != 0 )); if (ascii->bpl == 0 || ascii->rpl == 0) /* no help; brute force resolution. */ { offset = sq->doff; actual_start = 1; } else if (ascii->bpl == ascii->rpl+1) /* residue resolution */ { line = (sq->start-1) / ascii->rpl; /* data line #0.. that is on */ offset = sq->doff + line * ascii->bpl + (sq->start-1)%ascii->rpl; actual_start = sq->start; } else /* line resolution */ { line = (sq->start-1) / ascii->rpl; /* data line #0.. that is on */ offset = sq->doff + line * ascii->bpl; actual_start = 1 + line * ascii->rpl; } if (esl_sqfile_Position(sqfp, offset) != eslOK) ESL_EXCEPTION(eslECORRUPT, "Failed to reposition seq file for reverse window read"); /* grab the subseq and rev comp it */ if ((status = esl_sq_GrowTo(sq, sq->C+sq->W)) != eslOK) return status; sq->n = 0; status = read_nres(sqfp, sq, (sq->start - actual_start), (sq->end - sq->start + 1), &nres); if (status != eslOK || nres < (sq->end - sq->start + 1)) ESL_EXCEPTION(eslECORRUPT, "Failed to extract %d..%d", sq->start, sq->end); status = esl_sq_ReverseComplement(sq); if (status == eslEINVAL) ESL_FAIL(eslEINVAL, ascii->errbuf, "can't reverse complement that seq - it's not DNA/RNA"); else if (status != eslOK) return status; return eslOK; } /* Else, we're reading the forward strand */ else { /* sq->start == 0 means we haven't read any windows on this sequence yet... * it's a new record, and we need to initialize with the header and * the first window. This is the only case that we're allowed to return * EOF from. */ if (sq->start == 0) { if (ascii->nc == 0) return eslEOF; if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */ sq->start = 1; sq->C = 0;/* no context in first window */ sq->L = -1;/* won't be known 'til EOD. */ ascii->L = 0;/* init to 0, so we can count residues as we go */ esl_sq_SetSource(sq, sq->name); /* the buf> is now positioned at the start of seq data */ /* ascii->linenumber is ok where it is */ /* the header_*() routines initialized rpl,bpl bookkeeping at start of seq line, * and also sq->doff,roff. */ } else { /* else we're reading a window other than first; slide context over. */ sq->C = ESL_MIN(C, sq->n); /* if the case where the window is smaller than the context and the * context is not full, it is not necessary to move the context part * of the sequence that has been read in. */ if (sq->C >= C) { /* now handle the case where the context is full */ if (sq->seq != NULL) memmove(sq->seq, sq->seq + sq->n - sq->C, sq->C); else memmove(sq->dsq+1, sq->dsq + sq->n - sq->C + 1, sq->C); sq->start = ascii->L - sq->C + 1; sq->n = C; } } if ((status = esl_sq_GrowTo(sq, C+W)) != eslOK) return status; /* EMEM */ status = read_nres(sqfp, sq, 0, W, &nres); ascii->L += nres; if (status == eslEOD) { /* Forward strand is done. 0 residues were read. Return eslEOD and an empty (info) . */ if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status; sq->start = 0; sq->end = 0; sq->C = 0; sq->W = 0; sq->L = ascii->L; sq->n = 0; if (ascii->nc > 0) { ascii->bookmark_offset = ascii->boff+ascii->bpos; /* remember where the next seq starts. */ //ascii->bookmark_linenum = ascii->bookmark_linenum; } else { ascii->bookmark_offset = 0; /* signals for EOF, no more seqs */ ascii->bookmark_linenum = 0; } if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; /* erase the saved context */ else sq->seq[0] = '\0'; return eslEOD; } else if (status == eslOK) { /* Forward strand is still in progress. <= W residues were read. Return eslOK. */ sq->end = sq->start + sq->C + nres - 1; sq->W = nres; return eslOK; } else return status;/* EFORMAT,EMEM */ } /*NOTREACHED*/ return eslOK; ERROR: if (tmpsq != NULL) esl_sq_Destroy(tmpsq); return status; } /* Function: sqascii_ReadBlock() * Synopsis: Read the next block of sequences from a file. * * Purpose: Reads a block of sequences from open sequence file into * . * * In the case that is false, the sequences are * expected to be protein - individual sequences won't be long * so read them in one-whole-sequence at a time. If is set * to a number > 0 read sequences, up to at most * MAX_RESIDUE_COUNT residues. value is irrelevant * if is false. * * If is true, the sequences are expected to be DNA. * Because sequences in a DNA database can exceed MAX_RESIDUE_COUNT, * this function uses ReadWindow to read chunks of sequence no * larger than , and must allow for the possibility that a * request will be made to continue reading a partly-read * sequence. This case also respects the limit. * * If is true and is TRUE, * the first window read from each sequence (of length L) * is always min(L, ). If * is FALSE, then the length of the first window read from * each sequence is calculated differently as * max( - , * .05); * where is total number of residues already existing * in the block. == TRUE mode was added * to ensure that the window boundaries read are not dependent * on the order of the sequence in the file, thus ensuring * reproducibility if (for example) a user extracts one * sequence from a file and reruns a program on it (and all * else remains equal). * * Returns: on success; the new sequence is stored in . * * Returns when there is no sequence left in the * file (including first attempt to read an empty file). * * Returns if there's a problem with the format, * such as an illegal character; the line number that the parse * error occurs on is in linenumber>, and an informative * error message is placed in errbuf>. * * Throws: on allocation failure; * on internal error. */ static int sqascii_ReadBlock(ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int max_init_window, int long_target) { int i = 0; int size = 0; int status = eslOK; ESL_SQ *tmpsq = NULL; sqBlock->count = 0; if (max_sequences < 1 || max_sequences > sqBlock->listSize) max_sequences = sqBlock->listSize; if ( !long_target ) { /* in these cases, an individual sequence won't ever be really long, so just read in a sequence at a time */ for (i = 0; i < max_sequences && size < MAX_RESIDUE_COUNT; ++i) { status = sqascii_Read(sqfp, sqBlock->list + i); if (status != eslOK) break; size += sqBlock->list[i].n; ++sqBlock->count; } } else { /* DNA, not an alignment. Might be really long sequences */ if (max_residues < 1) max_residues = MAX_RESIDUE_COUNT; tmpsq = esl_sq_CreateDigital(sqBlock->list->abc); //if complete flag is set to FALSE, then the prior block must have ended with a window that was a possibly //incomplete part of it's full sequence. Read another overlapping window. if (! sqBlock->complete ) { //overloading C as indicator of how big C should be for this window reading action status = sqascii_ReadWindow(sqfp, sqBlock->list->C, max_residues, sqBlock->list); if (status == eslOK) { sqBlock->count = i = 1; size = sqBlock->list->n - sqBlock->list->C; sqBlock->list->L = sqfp->data.ascii.L; if (size == max_residues) { // Filled the block with a single very long window. sqBlock->complete = FALSE; // default value, unless overridden below status = skip_whitespace(sqfp); if ( status != eslOK ) { // either EOD or end of buffer (EOF) was reached before the next character was seen sqBlock->complete = TRUE; status = eslOK; } if(tmpsq != NULL) esl_sq_Destroy(tmpsq); return status; } else { // Burn off EOD (see notes for similar entry ~25 lines below), then go fetch the next sequence esl_sq_Reuse(tmpsq); tmpsq->start = sqBlock->list->start ; tmpsq->C = 0; status = sqascii_ReadWindow(sqfp, 0, max_residues, tmpsq); if (status != eslEOD) { if(tmpsq != NULL) esl_sq_Destroy(tmpsq); return status; //surprising } //sqBlock->list->L = tmpsq->L; } } else if (status == eslEOD) { // turns out there isn't any more of the sequence to read, after all } else { if(tmpsq != NULL) esl_sq_Destroy(tmpsq); return status; } } // otherwise, just start at the beginning for ( ; i < max_sequences && size < max_residues; ++i) { /* restricted request_size is used to ensure that all blocks are pretty close to the * same size. Without it, we may either naively keep asking for max_residue windows, * which can result in a window with ~2*max_residues ... or we can end up with absurdly * short fragments at the end of blocks */ int request_size = (max_init_window) ? max_residues : ESL_MAX(max_residues-size, max_residues * .05); esl_sq_Reuse(tmpsq); esl_sq_Reuse(sqBlock->list + i); status = sqascii_ReadWindow(sqfp, 0, request_size , tmpsq); esl_sq_Copy(tmpsq, sqBlock->list +i); if (status != eslOK && status != eslEOD){ break; } /* end of sequences (eslEOF), or we read an empty seq (eslEOD) or error (other) */ size += sqBlock->list[i].n - sqBlock->list[i].C; sqBlock->list[i].L = sqfp->data.ascii.L; ++(sqBlock->count); if (size >= max_residues) { // a full window worth of sequence has been read; did we reach the end of the final sequence in the block? sqBlock->complete = FALSE; // default value, unless overridden below status = skip_whitespace(sqfp); if ( status != eslOK ) { // either EOD or end of buffer (EOF) was reached before the next character was seen sqBlock->complete = TRUE; status = eslOK; } if(tmpsq != NULL) esl_sq_Destroy(tmpsq); return status; } else if(status == eslEOD) { /* We've read an empty sequence of length 0, rare, but * possible, and we need to be able to handle it * gracefully. Ensure L is 0, set status to eslOK and move * on, we've already incremented sqBlock->count by 1 * above. This means our block may contain zero-length * sequences when we return (that is, we still add these * seqs onto the block instead of skipping them altogether). */ sqBlock->list[i].L = 0; /* actually, this should already be 0... */ status = eslOK; } else { /* Sequence finished, but haven't yet reached max_residues. Need to burn off the EOD value that will be returned by the next ReadWindow call. Can just use a tmp sq, after setting a couple values ReadWindow needs to see for correct processing. */ esl_sq_Reuse(tmpsq); tmpsq->start = sqBlock->list[i].start ; tmpsq->C = 0; status = sqascii_ReadWindow(sqfp, 0, max_residues, tmpsq); if (status != eslEOD) { if(tmpsq != NULL) esl_sq_Destroy(tmpsq); return status; //surprising } //sqBlock->list[i].L = tmpsq->L; status = eslOK; } } } /* EOF will be returned only in the case were no sequences were read */ if (status == eslEOF && i > 0) status = eslOK; sqBlock->complete = TRUE; if(tmpsq != NULL) esl_sq_Destroy(tmpsq); return status; } /* Function: sqascii_Echo() * Synopsis: Echo a sequence's record onto output stream. * * Purpose: Given a complete that we have read by some means * from an open ; echo that sequence's record * onto the output stream . * * This allows records to be regurgitated exactly as they * appear, rather than writing the subset of information * stored in an . in the miniapps uses * this, for example. * * Because this relies on repositioning the , it * cannot be called on non-positionable streams (stdin or * gzipped files). Because it relies on the sequence lying * in a contiguous sequence of bytes in the file, it cannot * be called on a sequence in a multiple alignment file. * Trying to do so throws an exception. * * Returns: on success. * * Throws: if isn't a repositionable sequence file. * if we run out of data, probably from bad offsets * on allocation failure. * on system call failures. * * */ static int sqascii_Echo(ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp) { int status; int64_t save_linenumber; int save_currpl; int save_curbpl; int save_prvrpl; int save_prvbpl; int64_t save_L; int n; int nwritten; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->do_stdin) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from standard input"); if (ascii->do_gzip) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from a gzipped file"); if (esl_sqio_IsAlignment(sqfp->format)) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from an alignment file"); if (sq->roff == -1 || sq->eoff == -1) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence without disk offset info"); save_linenumber = ascii->linenumber; save_currpl = ascii->currpl; save_curbpl = ascii->curbpl; save_prvrpl = ascii->prvrpl; save_prvbpl = ascii->prvbpl; save_L = ascii->L; status = esl_sqfile_Position(sqfp, sq->roff); if (status == eslEOF) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?"); else if (status != eslOK) return status; while (ascii->boff + ascii->nc <= sq->eoff) { if (fwrite(ascii->buf, sizeof(char), ascii->nc, ofp) != ascii->nc) ESL_EXCEPTION(eslESYS, "fwrite() failed"); if (loadbuf(sqfp) != eslOK) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?"); } n = sq->eoff - ascii->boff + 1; nwritten = fwrite(ascii->buf, sizeof(char), n, ofp); if (nwritten != n) ESL_EXCEPTION(eslESYS, "fwrite() failed"); status = esl_sqfile_Position(sqfp, sq->roff); if (status == eslEOF) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?"); else if (status != eslOK) return status; ascii->linenumber = save_linenumber; ascii->currpl = save_currpl; ascii->curbpl = save_curbpl; ascii->prvrpl = save_prvrpl; ascii->prvbpl = save_prvbpl; ascii->L = save_L; return eslOK; } /*------------------ end, sequential sequence input -------------*/ /***************************************************************** *# 5. Sequence/subsequence fetching, random access [with ] *****************************************************************/ /* Function: sqascii_OpenSSI() * Synopsis: Opens an SSI index associated with a sequence file. * * Purpose: Opens an SSI index file associated with the already open * sequence file . If successful, the necessary * information about the open SSI file is stored internally * in . * * The SSI index file name is determined in one of two * ways, depending on whether a non- * is provided. * * If is , the default for * constructing the SSI filename from the sequence * filename, by using exactly the same path (if any) for * the sequence filename, and appending the suffix <.ssi>. * For example, the SSI index for is , for * <./foo.fa> is <./foo.fa.ssi>, and for * is . * * If is , this exact fully * qualified path is used as the SSI file name. * * Returns: on success, and ssi> is now internally * valid. * * if no SSI index file is found; * if it's found, but appears to be in incorrect format; * if the SSI file uses 64-bit offsets but we're on * a system that doesn't support 64-bit file offsets. * * Throws: if the open sequence file doesn't * correspond to a normal sequence flatfile -- we can't * random access in .gz compressed files, standard input, * or multiple alignment files that we're reading * sequentially. * * Throws on allocation error. */ static int sqascii_OpenSSI(ESL_SQFILE *sqfp, const char *ssifile_hint) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->do_gzip) ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for a .gz compressed seq file"); if (ascii->do_stdin) ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for standard input"); if (ascii->afp != NULL) ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for sequential input from an MSA"); if (ssifile_hint == NULL) { if ((status = esl_strdup(sqfp->filename, -1, &(ascii->ssifile))) != eslOK) return status; if ((status = esl_strcat(&(ascii->ssifile), -1, ".ssi", 4)) != eslOK) return status; } else { if ((status = esl_strdup(ssifile_hint, -1, &(ascii->ssifile))) != eslOK) return status; } return esl_ssi_Open(ascii->ssifile, &(ascii->ssi)); } /* Function: sqascii_PositionByKey() * Synopsis: Use SSI to reposition seq file to a particular sequence. * * Purpose: Reposition so that the next sequence we read will * be the one named (or accessioned) . * * linenumber> is reset to be relative to the start * of the record named , rather than the start of the * file. * * Returns: on success, and the file is repositioned * so that the next call will read the * sequence named . * * Returns if isn't found in the * index; in this case, the position of in the file * is unchanged. * * Returns if something goes wrong trying to * read the index, almost certainly indicating a format * problem in the SSI file. * * Returns if, after repositioning, we fail to * load the next line or buffer from the sequence file; * this probably also indicates a format problem in the SSI * file. * * Throws: on allocation error; * if there's no open SSI index in ; * if the fails. * * In all these cases, the state of becomes * undefined, and the caller should not use it again. */ static int sqascii_PositionByKey(ESL_SQFILE *sqfp, const char *key) { uint16_t fh; off_t offset; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->ssi == NULL) ESL_EXCEPTION(eslEINVAL,"Need an open SSI index to call esl_sqfile_PositionByKey()"); if ((status = esl_ssi_FindName(ascii->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status; return esl_sqfile_Position(sqfp, offset); } /* Function: sqascii_PositionByNumber() * Synopsis: Use SSI to reposition by sequence number * * Purpose: Reposition so that the next sequence we * read will be the 'th sequence, where * is <0..sqfp->ssi->nprimary-1>. * * linenumber> is reset to be relative to the start * of the record named , rather than the start of the * file. * * Returns: on success, and the file is repositioned. * * Returns if there is no sequence number * in the index; in this case, the position of * in the file is unchanged. * * Returns if something goes wrong trying to * read the index, almost certainly indicating a format * problem in the SSI file. * * Returns if, after repositioning, we fail to * load the next line or buffer from the sequence file; * this probably also indicates a format problem in the SSI * file. * * Throws: on allocation error; * if there's no open SSI index in ; * if the fails. * * In all these cases, the state of becomes * undefined, and the caller should not use it again. */ static int sqascii_PositionByNumber(ESL_SQFILE *sqfp, int which) { uint16_t fh; off_t offset; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->ssi == NULL) ESL_EXCEPTION(eslEINVAL,"Need open SSI index to call esl_sqfile_PositionByNumber()"); if ((status = esl_ssi_FindNumber(ascii->ssi, which, &fh, &offset, NULL, NULL, NULL)) != eslOK) return status; return esl_sqfile_Position(sqfp, offset); } /* Function: sqascii_Fetch() * Synopsis: Fetch a complete sequence, using SSI indexing. * * Purpose: Fetch a sequence named (or accessioned) from * the repositionable, open sequence file . * The open must have an open SSI index. * The sequence is returned in . * * Returns: on soccess. * if no SSI index is present, or if can't * be repositioned. * if isn't found in the file. * if either the index file or the sequence file * can't be parsed, because of unexpected format issues. * * Throws: on allocation error. */ static int sqascii_Fetch(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename); if ((status = sqascii_PositionByKey(sqfp, key)) != eslOK) return status; if ((status = sqascii_Read(sqfp, sq)) != eslOK) return status; return eslOK; } /* Function: sqascii_FetchInfo() * Synopsis: Fetch a sequence's info, using SSI indexing. * * Purpose: Fetch a sequence named (or accessioned) from * the repositionable, open sequence file , reading * all info except the sequence (and secondary structure). * The open must have an open SSI index. * The sequence info is returned in . * * Returns: on soccess. * if no SSI index is present, or if can't * be repositioned. * if isn't found in the file. * if either the index file or the sequence file * can't be parsed, because of unexpected format issues. * * Throws: on allocation error. */ static int sqascii_FetchInfo(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename); if ((status = sqascii_PositionByKey(sqfp, key)) != eslOK) return status; if ((status = sqascii_ReadInfo(sqfp, sq)) != eslOK) return status; return eslOK; } /* Function: sqascii_FetchSubseq() * Synopsis: Fetch a subsequence, using SSI indexing. * * Purpose: Fetch subsequence from a sequence named (or * accessioned) , in the repositionable, open sequence file . * The open must have an SSI index. Put the * subsequence in . * * As a special case, if is 0, the subsequence is * fetched all the way to the end, so you don't need to * look up the sequence length to fetch a suffix. * * The caller may want to rename/reaccession/reannotate the * subsequence. Upon successful return, name> is set * to , and source> is set to * The accession and description acc> and * desc> are set to the accession and description of * the source sequence. * * Returns: on success. * if no SSI index is present, or if can't * be repositioned. * if isn't found in the file. * if either the index file or the sequence file * can't be parsed, because of unexpected format issues. * if the coords don't lie entirely * within the sequence. * * Throws: on allocation errors. */ static int sqascii_FetchSubseq(ESL_SQFILE *sqfp, const char *source, int64_t start, int64_t end, ESL_SQ *sq) { uint16_t fh;/* SSI file handle */ off_t r_off, d_off; int64_t L; int64_t actual_start; int64_t nskip; int64_t nres; int64_t n; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename); /* Find sequence info in the index */ status = esl_ssi_FindSubseq(ascii->ssi, source, start, &fh, &r_off, &d_off, &L, &actual_start); if (status == eslENOTFOUND) ESL_FAIL(status, ascii->errbuf, "Didn't find sequence %s in the index", source); else if (status == eslEFORMAT) ESL_FAIL(status, ascii->errbuf, "Failure reading SSI index; corrupt or bad format"); else if (status == eslERANGE) ESL_FAIL(status, ascii->errbuf, "Requested start %" PRIi64 " isn't in the sequence %s", start, source); else if (status != eslOK) ESL_FAIL(status, ascii->errbuf, "Unexpected failure in finding subseq offset"); /* The special case of end=0, asking for suffix fetch */ if (end == 0) end = L; /* Validate coords if we can */ if (start > end) ESL_FAIL(eslERANGE, ascii->errbuf, "Subsequence start %" PRIi64 " is greater than end %" PRIi64 "\n", start, end); if (L > 0 && end > L) ESL_FAIL(eslERANGE, ascii->errbuf, "Subsequence end %" PRIi64 " is greater than length %" PRIi64 "\n", end, L); /* Position the file at the record header; read the header info */ status = esl_sqfile_Position(sqfp, r_off); if (status == eslEOF) ESL_FAIL(status, ascii->errbuf, "Position appears to be off the end of the file"); else if (status == eslEINVAL) ESL_FAIL(status, ascii->errbuf, "Sequence file is not repositionable"); else if (status != eslOK) ESL_FAIL(status, ascii->errbuf, "Failure in positioning sequence file"); if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* Position the file close to the subseq: either at the start of the line * where the subseq starts, or exactly at the residue. */ if (d_off != 0) { status = esl_sqfile_Position(sqfp, d_off); if (status == eslEOF) ESL_FAIL(eslERANGE, ascii->errbuf, "Position appears to be off the end of the file"); else if (status == eslEINVAL) ESL_FAIL(status, ascii->errbuf, "Sequence file is not repositionable"); else if (status != eslOK) ESL_FAIL(status, ascii->errbuf, "Failure in positioning sequence file"); } /* even if we didn't have a data offset, we're positioned at the * start of the sequence anyway, because we parsed the full header */ nskip = start - actual_start; /* how many residues do we still need to skip to reach start */ nres = end - start + 1; /* how many residues do we need to read as subseq */ if ((status = esl_sq_GrowTo(sq, nres)) != eslOK) return status; status = read_nres(sqfp, sq, nskip, nres, &n); if (status != eslOK || n < nres) ESL_EXCEPTION(eslEINCONCEIVABLE, "Failed to fetch subsequence residues -- corrupt coords?"); /* Set the coords */ sq->start = start; sq->end = end; sq->C = 0; sq->W = sq->n; sq->L = (L > 0 ? L : -1); esl_sq_FormatName(sq, "%s/%" PRId64 "-%" PRId64, source, start, end); esl_sq_SetSource (sq, source); return eslOK; } /*------------- end, random sequence access with SSI -------------------*/ /***************************************************************** * 6. Internal routines shared by parsers *****************************************************************/ /* loadmem() * * Load the next block of data from stream into mem buffer, * either concatenating to previous buffer (if we're recording) or * overwriting (if not). * * This block is loaded at sqfp->mem + sqfp->mpos. * * Upon return: * sqfp->mem now contains up to eslREADBUFSIZE more chars * sqfp->mpos is position of first byte in newly read block * sqfp->allocm may have increased by eslREADBUFSIZE, if we concatenated * sqfp->mn is # of chars in ; is pos of last byte in new block * * Returns (and mpos == mn) if no new data can be read; * Returns (and mpos < mn) if new data is read. * Throws on allocation error. */ static int loadmem(ESL_SQFILE *sqfp) { void *tmp; int n = 0; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->do_buffer) { ascii->mpos = 0; ascii->mn = 0; } else if (ascii->is_recording == TRUE) { if (ascii->mem == NULL) ascii->moff = ftello(ascii->fp); /* first time init of the offset */ ESL_RALLOC(ascii->mem, tmp, sizeof(char) * (ascii->allocm + eslREADBUFSIZE)); ascii->allocm += eslREADBUFSIZE; n = fread(ascii->mem + ascii->mpos, sizeof(char), eslREADBUFSIZE, ascii->fp); ascii->mn += n; } else { if (ascii->mem == NULL) { ESL_ALLOC(ascii->mem, sizeof(char) * eslREADBUFSIZE); ascii->allocm = eslREADBUFSIZE; } ascii->is_recording = -1;/* no more recording is possible now */ ascii->mpos = 0; ascii->moff = ftello(ascii->fp); n = fread(ascii->mem, sizeof(char), eslREADBUFSIZE, ascii->fp); /* see note [1] below */ ascii->mn = n; } return (n == 0 ? eslEOF : eslOK); ERROR: return status; } /* [1] Be alert for a possible problem above in that fread(). * Farrar had inserted an alternative case as follows: * "If we are reading from stdin, buffered read cannot be used * because if will block until EOF or the buffer is full, ie * eslREADBUFSIZE characters have been read. Usually this would * not be a problem, unless stdin is from a pipe. In that case * if the sequence is less than eslREADBUFSIZE we would block. * * NOTE: any changes to the IO stream ascii->fp, such as fseek, * might not have any affect on the file descriptor for the stream. * * if (ascii->do_stdin) { * n = read(fileno(ascii->fp), ascii->mem, eslREADBUFSIZE); * } else { * ... * * but that's a bug, because you can't mix read and fread; * the i17-stdin.pl test fails, in particular. */ /* loadbuf() * Set sqfp->buf to contain next line of data, or point to next block. * This might just mean working with previously buffered memory in mem> * or might require reading new data from fp>. * * Reset sqfp->boff to be the position of the start of the block/line. * Reset sqfp->bpos to 0. * Reset sqfp->nc to the number of chars (bytes) in the new block/line. * Returns eslOK on success; eslEOF if there's no more data in the file. * (sqfp->nc == 0 is the same as eslEOF: no data in the new buffer.) * Can throw an error. */ static int loadbuf(ESL_SQFILE *sqfp) { void *tmp; char *nlp; int n; int status = eslOK; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (! ascii->is_linebased) { if (ascii->mpos >= ascii->mn) { if ((status = loadmem(sqfp)) == eslEMEM) return status; } ascii->buf = ascii->mem + ascii->mpos; ascii->boff = ascii->moff + ascii->mpos; ascii->balloc = 0; ascii->bpos = 0; ascii->nc = ascii->mn - ascii->mpos; ascii->mpos += ascii->nc; } else { /* Copy next line from into . Might require new load(s) into . */ if (ascii->mpos >= ascii->mn) { if ((status = loadmem(sqfp)) == eslEMEM) return status; } ascii->boff = ascii->moff + ascii->mpos; ascii->nc = 0; nlp = memchr(ascii->mem + ascii->mpos, '\n', ascii->mn - ascii->mpos); while (nlp == NULL) { n = ascii->mn - ascii->mpos; while (ascii->nc + n + 1 > ascii->balloc) { /* +1: it'll hold the terminal \0 */ ESL_RALLOC(ascii->buf, tmp, sizeof(char) * (ascii->balloc + eslREADBUFSIZE)); ascii->balloc += eslREADBUFSIZE; } memcpy(ascii->buf + ascii->nc, ascii->mem + ascii->mpos, n); ascii->mpos += n; ascii->nc += n; status = loadmem(sqfp); if (status == eslEOF) { break; } else if (status != eslOK) return status; nlp = memchr(ascii->mem + ascii->mpos, '\n', ascii->mn - ascii->mpos); } if (status != eslEOF) { n = nlp - (ascii->mem + ascii->mpos) + 1; /* inclusive of \n */ if (ascii->nc + n + 1 > ascii->balloc) { ESL_RALLOC(ascii->buf, tmp, sizeof(char) * (ascii->balloc + eslREADBUFSIZE)); ascii->balloc += eslREADBUFSIZE; } memcpy(ascii->buf + ascii->nc, ascii->mem + ascii->mpos, n); ascii->mpos += n; ascii->nc += n; } ascii->bpos = 0; ascii->buf[ascii->nc] = '\0'; } return (ascii->nc == 0 ? eslEOF : eslOK); ERROR: return status; } /* nextchar() * * Load next char from sqfp->buf into <*ret_c> and sets sqfp->bpos to * its position; usually this is c = sqfp->buf[++sqfp->bpos], but * we will refill the buffer w/ fresh fread() when needed, in which * case c = sqfp->buf[0] and sqfp->bpos = 0. * * Returns on success. * Return if we ran out of data in . * May throw an error. */ static int nextchar(ESL_SQFILE *sqfp, char *ret_c) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; ascii->bpos++; if (ascii->nc == ascii->bpos && (status = loadbuf(sqfp)) != eslOK) return status; *ret_c = ascii->buf[ascii->bpos]; return eslOK; } /* seebuf() * * Examine and validate the current buffer buf> from its * current position bpos> until either the buffer ends (we run * out of characters) or the sequence data ends (we see whatever * character indicates EOD in this format) or we've seen * residues. If is passed as -1, parse the entire buffer, * without a residue limit. * * There are three possible outcomes: * : The buffer is all residues that belong to the current * seq we're parsing (or chars we can ignore), at least * up to the residue limit (if present). * : Part of the buffer may be residues, but the current sequence * ends in this buffer (before was reached). * : Somewhere before we reached the end of the buffer or * the sequence record, we saw an illegal character. * * On : * *opt_nres is the number of residues in the buffer (up to ) * *opt_endpos is sqfp->nc (off the end of the buffer by one) * The caller will want to deal with the buffer, then load the next one. * * On : same as OK, except: * *opt_endpos is where sqfp->bpos *would* be at when we saw the EOD * signal (the next '>', in FASTA files) had we been parsing residues * Therefore on EOD, the caller will want to deal with the <*opt_nres> * residues in this buffer, then reposition the buffer by * bpos = *opt_epos> (without reloading the buffer), so * the next read will pick up there. * * On : * ascii->errbuf contains informative message about the format error. * * seebuf() also handles linenumber and SSI bookkeeping in * . Every newline character seen increments (thus, * on EFORMAT return, linenumber is set to the line on which the bad * char occurred). ,,, keep track of # of bytes, * residues on the current,prev line; they keep state across calls to seebuf(). * , are tracking whether there's a constant number of * bytes/residues per line; these are either -1 for "not set yet", 0 * for "no, not constant", or a number > 0. Because of this bookkeeping, it's important * to make sure that never counts the same byte twice (hence * the need for the limit, which ReadWindow() uses.) */ static int seebuf(ESL_SQFILE *sqfp, int64_t maxn, int64_t *opt_nres, int64_t *opt_endpos) { int bpos; int64_t nres = 0; int64_t nres2 = 0;/* an optimization for determining lastrpl from nres, without incrementing lastrpl on every char */ int sym; ESL_DSQ x; int lasteol; int status = eslOK; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; lasteol = ascii->bpos - 1; if (maxn == -1) maxn = ascii->nc; /* makes for a more efficient test. nc is a guaranteed upper bound on nres */ for (bpos = ascii->bpos; nres < maxn && bpos < ascii->nc; bpos++) { sym = ascii->buf[bpos]; //printf ("nres: %d, bpos: %d (%d)\n", nres, bpos, sym); if (!isascii(sym)) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": non-ASCII character %c in sequence", ascii->linenumber, sym); x = sqfp->inmap[sym]; if (x <= 127) nres++; else if (x == eslDSQ_EOL) { if (ascii->curbpl != -1) ascii->curbpl += bpos - lasteol; if (ascii->currpl != -1) ascii->currpl += nres - nres2; nres2 += nres - nres2; if (ascii->rpl != 0 && ascii->prvrpl != -1) { /* need to treat counts on last line in record differently (can be shorter but not longer), hence cur/prv */ if (ascii->rpl == -1) ascii->rpl = ascii->prvrpl; /* init */ else if (ascii->prvrpl != ascii->rpl) ascii->rpl = 0; /* inval */ else if (ascii->currpl > ascii->rpl) ascii->rpl = 0; /* inval, this covers case when final line is longer */ } if (ascii->bpl != 0 && ascii->prvbpl != -1) { if (ascii->bpl == -1) ascii->bpl = ascii->prvbpl; /* init */ else if (ascii->prvbpl != ascii->bpl) ascii->bpl = 0; /* inval */ else if (ascii->curbpl > ascii->bpl) ascii->bpl = 0; /* inval, this covers case when final line is longer */ } ascii->prvbpl = ascii->curbpl; ascii->prvrpl = ascii->currpl; ascii->curbpl = 0; ascii->currpl = 0; lasteol = bpos; if (ascii->linenumber != -1) ascii->linenumber++; } else if (x == eslDSQ_ILLEGAL) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": illegal character %c", ascii->linenumber, sym); else if (x == eslDSQ_EOD) { status = eslEOD; break; } else if (x != eslDSQ_IGNORED) ESL_FAIL(eslEFORMAT, ascii->errbuf, "inmap corruption?"); } if (ascii->curbpl != -1) ascii->curbpl += bpos - lasteol - 1; if (ascii->currpl != -1) ascii->currpl += nres - nres2; if (opt_nres != NULL) *opt_nres = nres; if (opt_endpos != NULL) *opt_endpos = bpos; return status; } /* addbuf() * Add residues from the current buffer buf> to . * This is designed to work when we're constructing a complete * sequence (add the whole buffer); when we're adding a suffix * of the buffer (bpos> is skipped ahead already); * or when we're adding a prefix of the buffer (terminating a subseq * or window load). * * The caller must know that there are at least residues in * this buffer, and that all the characters are valid in the * format and alphabet, via a previous call to . * * The caller also must have already allocated to hold at least * more residues. * * On input: * sqfp->buf[] contains an fread() buffer * sqfp->bpos is set to where we're going to start parsing residues * sqfp->nc is the length of * * On return: * sqfp->buf[] still contains the same buffer (no new freads here) * sqfp->bpos is set after the last residue we parsed * sq->seq/dsq now holds new residues * sq->n is incremented by */ static void addbuf(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nres) { ESL_DSQ x; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (sq->dsq != NULL) { while (nres) { x = sq->abc->inmap[(int) ascii->buf[ascii->bpos++]]; if (x <= 127) { nres--; sq->dsq[++sq->n] = x; } } /* we skipped IGNORED, EOL. EOD, ILLEGAL don't occur; seebuf() already checked */ } else { while (nres) { x = sqfp->inmap[(int) ascii->buf[ascii->bpos++]]; if (x <= 127) { nres--; sq->seq[sq->n++] = x; } } } } /* skipbuf() * Like addbuf(), but we skip residues instead of * reading them. */ static void skipbuf(ESL_SQFILE *sqfp, int64_t nskip) { ESL_DSQ x; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; while (nskip) { x = sqfp->inmap[(int) ascii->buf[ascii->bpos++]]; if (x <= 127) nskip--;/* skip IGNORED, EOL. */ } } /* skip_whitespace() * Like skipbuf(), but instead of skipping a fixed number of * residues, skip forward until one of three conditions is met: * * (1) end of the sequence record (a character indicating * the beginning of a new sequence); set ascii->bpos * to the beginning of the new record, and return eslEOD; * (2) a non-whitespace character in the current sequence is * reached that does not indicate the end of a sequence * record; set ascii->bpos to that character's position, * and return eslOK; * (3) end of file; return eslEOF. * */ static int skip_whitespace(ESL_SQFILE *sqfp) { int status; int c; ESL_DSQ x; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->nc == 0) return eslEOF; /* if at end of buffer, reload it */ if (ascii->bpos == ascii->nc) if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; c = (int) ascii->buf[ascii->bpos]; x = sqfp->inmap[c]; while ( isspace(c) ) { ascii->bpos++; /* if at end of buffer, reload it */ if (ascii->bpos == ascii->nc) if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; c = (int) ascii->buf[ascii->bpos]; x = sqfp->inmap[c]; } if (x == eslDSQ_EOD) return eslEOD; return eslOK; } /* read_nres() * Read the next residues from after skipping residues, then stop. * * Returns and <0 < *ret_actual_nres <= nres> if it succeeded, and * there's more residues in the current seq record. * Returns and <*ret_actual_nres == 0> if no more residues are * seen in the sequence record. * * Even on , the is appropriately terminated here, * and n> is left the way it was (no new residues added - but there * may have been saved context C from a previous window). * * Returns on any parsing problem, and errbuf> is set. * * On , sqfp->bpos is positioned on the next character past the last residue we store; * on , sqfp->bpos is positioned for reading the next sequence. * * FetchSubseq() uses this with , , and expects an * with <*opt_actual_nres = nres>. On , or if fewer than * residues are obtained, the coords must've been screwed up, * because we didn't read the whole subseq we asked for. * * ReadWindow() on forward strand uses this with , . * The last window might normally return with * <*ret_actual_nres == 0>, and now bpos> is positioned at the * start of the next sequence on , and at the next residue on * . * * ReadWindow() in reverse complement acts like a subseq fetch. * */ static int read_nres(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nskip, int64_t nres, int64_t *opt_actual_nres) { int64_t n; int64_t epos; int64_t actual_nres = 0; int status = eslOK; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; status = seebuf(sqfp, nskip+nres, &n, &epos); while (status == eslOK && nskip - n > 0) { nskip -= n; if ((status = loadbuf(sqfp)) == eslEOF) break; status = seebuf(sqfp, nskip+nres, &n, &epos); } if (status == eslEOF) { if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF before end of seq record"); if (nskip > 0) ESL_EXCEPTION(eslECORRUPT, "premature EOD while trying to skip residues"); n = 0; } else if (status == eslEOD) { if (n < nskip) ESL_EXCEPTION(eslECORRUPT, "premature EOD while trying to skip residues"); } else if (status != eslOK) return status; skipbuf(sqfp, nskip); n -= nskip; while (status == eslOK && nres - n > 0) { addbuf(sqfp, sq, n); actual_nres += n; nres -= n; if ((status = loadbuf(sqfp)) == eslEOF) break; status = seebuf(sqfp, nres, &n, &epos); } if (status == eslEOF) { if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF before end of seq record"); n = 0; } else if (status == eslEFORMAT) { return status; } n = ESL_MIN(nres, n); addbuf(sqfp, sq, n); /* bpos now at last residue + 1 if OK/EOD, 0 if EOF */ actual_nres += n; if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL; else sq->seq[sq->n] = '\0'; if (status == eslEOD) { ascii->bpos = epos; } if (opt_actual_nres != NULL) *opt_actual_nres = actual_nres; return (actual_nres == 0 ? eslEOD : eslOK); } /*--------------- end, buffer-based parsers --------------------*/ /***************************************************************** *# 7. Internal routines for EMBL format (including UniProt, TrEMBL) *****************************************************************/ /* EMBL and UniProt protein sequence database format. * See: http://us.expasy.org/sprot/userman.html * and: http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3 * We use the same parser for both formats, so we have to be * careful to only parse the conserved intersection of these two * very similar formats. */ static void config_embl(ESL_SQFILE *sqfp) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; ascii->is_linebased = TRUE; ascii->eof_is_ok = FALSE;/* records end with // */ ascii->parse_header = &header_embl; ascii->skip_header = &skip_embl; ascii->parse_end = &end_embl; } static void inmap_embl(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap) { int x; if (abc_inmap != NULL) { for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x]; sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format } else { for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL; for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x; for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x; } for (x = '0'; x <= '9'; x++) sqfp->inmap[x] = eslDSQ_IGNORED; /* EMBL DNA sequence format puts coordinates after each line */ sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */ sqfp->inmap[' '] = eslDSQ_IGNORED; sqfp->inmap['\t'] = eslDSQ_IGNORED; sqfp->inmap['\n'] = eslDSQ_IGNORED; sqfp->inmap['\r'] = eslDSQ_IGNORED; /* DOS eol compatibility */ sqfp->inmap['/'] = eslDSQ_EOD; } /* header_embl() * * See: http://us.expasy.org/sprot/userman.html * And: http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3 * Our parser must work on the highest common denominator of EMBL DNA * and UniProt protein sequence files. * * sqfp->buf is the first (ID) line of the entry, or a blank line before * it (in which case we'll scan forwards skipping blank lines to find * the ID line). * * On success, returns and: * sq->name contains sequence name (and may have been reallocated, changing sq->nalloc) * sq->acc contains seq accession (and may have been reallocated, changing sq->aalloc) * sq->desc contains description line (and may have been reallocated, changing sq->dalloc) * sq->roff has been set to the record offset * sq->doff has been set to the data offset (start of sequence line) * sqfp->buf is the first seq line. * * If no more seqs are found in the file, returns . * On parse failure, returns , leaves as mesg in ascii->errbuf. * * May also throw on allocation errors. */ static int header_embl(ESL_SQFILE *sqfp, ESL_SQ *sq) { char *s; char *tok; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* Find first line: * "Each entry must begin with an identification line (ID)..." * "The two-character line-type code that begins each line is always * followed by three blanks..." */ if (ascii->nc == 0) return eslEOF; while (esl_str_IsBlank(ascii->buf)) { if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */ else if (status != eslOK) return status; /* abnormal */ } /* ID line is defined as: * ID ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH. * We're only after the ENTRY_NAME. * Examples: * ID SNRPA_DROME STANDARD; PRT; 216 AA. * ID SNRPA_DROME Reviewed; 216 AA. * ID X06347; SV 1; linear; mRNA; STD; HUM; 1209 BP. */ if (strncmp(ascii->buf, "ID ", 5) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find ID line", ascii->linenumber); s = ascii->buf+5; if ((status = esl_strtok(&s, " ;", &tok)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse name on ID line", ascii->linenumber); if ((status = esl_sq_SetName(sq, tok)) != eslOK) return status; sq->roff = ascii->boff;/* record the offset of the ID line */ /* Look for SQ line; parsing optional info as we go. */ do { if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find SQ line", ascii->linenumber); /* "The format of the AC line is: * AC AC_number_1;[ AC_number_2;]...[ AC_number_N;] * Researchers who wish to cite entries in their publications * should always cite the first accession number. This is * commonly referred to as the 'primary accession * number'." * * Examples: * AC P43332; Q9W4D7; * AC X06347; * * Note that Easel only stores primary accessions. * Because there can be more than one accession line, we check to * see if the accession is already set before storing a line. */ if (strncmp(ascii->buf, "AC ", 5) == 0 && sq->acc[0] == '\0') { s = ascii->buf+5; if ((status = esl_strtok(&s, ";", &tok)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse accession on AC line", ascii->linenumber); if ((status = esl_sq_SetAccession(sq, tok)) != eslOK) return status; } /* "The format of the DE line is: * DE Description. * ...In cases where more than one DE line is required, the text is * only divided between words and only the last DE line is * terminated by a period." * * Examples: * DE U1 small nuclear ribonucleoprotein A (U1 snRNP protein A) (U1-A) (Sex * DE determination protein snf). * * DE Human mRNA for U1 small nuclear RNP-specific A protein * * DE RecName: Full=U1 small nuclear ribonucleoprotein A; * DE Short=U1 snRNP protein A; * DE Short=U1-A; * DE AltName: Full=Sex determination protein snf; * * We'll make no attempt to parse the structured UniProt description header, * for the moment. */ if (strncmp(ascii->buf, "DE ", 5) == 0) { s = ascii->buf+5; esl_strchop(s, ascii->nc-5); if ((status = esl_sq_AppendDesc(sq, s)) != eslOK) ESL_FAIL(status, ascii->errbuf, "Line %" PRId64 ": failed to parse description on DE line", ascii->linenumber); } /* UniProt: "The format of the SQ line is: * SQ SEQUENCE XXXX AA; XXXXX MW; XXXXXXXXXXXXXXXX CRC64;" * EMBL: "The SQ (SeQuence header) line marks the beginning of * the sequence data and Gives a summary of its content. * An example is: * SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;" * * We don't parse this line; we just look for it as the last line * before the sequence starts. */ } while (strncmp(ascii->buf, "SQ ", 5) != 0); if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence"); sq->hoff = ascii->boff - 1; sq->doff = ascii->boff; return eslOK; } /* skip_embl() * * Skip past the EMBL header and position to start of the sequence line. * * On success, returns and: * sq->roff has been set to the record offset * sq->doff has been set to the data offset (start of sequence line) * sqfp->buf is the first seq line. * * If no more seqs are found in the file, returns . * On parse failure, returns , leaves as mesg in ascii->errbuf. * * May also throw on allocation errors. */ static int skip_embl(ESL_SQFILE *sqfp, ESL_SQ *sq) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* Find first line: * "Each entry must begin with an identification line (ID)..." * "The two-character line-type code that begins each line is always * followed by three blanks..." */ if (ascii->nc == 0) return eslEOF; while (esl_str_IsBlank(ascii->buf)) { if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */ else if (status != eslOK) return status; /* abnormal */ } /* ID line */ if (strncmp(ascii->buf, "ID ", 5) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find ID line", ascii->linenumber); sq->roff = ascii->boff;/* record the offset of the ID line */ /* zero out the name, accession and description */ sq->name[0] = '\0'; sq->acc[0] = '\0'; sq->desc[0] = '\0'; /* Look for SQ line; parsing optional info as we go. */ do { if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find SQ line", ascii->linenumber); } while (strncmp(ascii->buf, "SQ ", 5) != 0); if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence"); sq->hoff = ascii->boff - 1; sq->doff = ascii->boff; return eslOK; } static int end_embl(ESL_SQFILE *sqfp, ESL_SQ *sq) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (strncmp(ascii->buf, "//", 2) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber); sq->eoff = ascii->boff + ascii->nc - 1; status = loadbuf(sqfp); if (status == eslEOF) return eslOK; /* ok, actually. */ else if (status == eslOK) return eslOK; else return status; } /*---------------------- EMBL format ---------------------------------*/ /***************************************************************** *# 8. Internal routines for GenBank format *****************************************************************/ /* NCBI GenBank sequence database format. * See GenBank release notes; for example, * ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt */ static void config_genbank(ESL_SQFILE *sqfp) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; ascii->is_linebased = TRUE; ascii->eof_is_ok = FALSE;/* records end with // */ ascii->parse_header = &header_genbank; ascii->skip_header = &skip_genbank; ascii->parse_end = &end_genbank; } static void inmap_genbank(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap) { int x; if (abc_inmap != NULL) { for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x]; sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format } else { for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL; for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x; for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x; } for (x = '0'; x <= '9'; x++) sqfp->inmap[x] = eslDSQ_IGNORED; sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */ sqfp->inmap[' '] = eslDSQ_IGNORED; sqfp->inmap['\t'] = eslDSQ_IGNORED; sqfp->inmap['\n'] = eslDSQ_IGNORED; sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */ sqfp->inmap['/'] = eslDSQ_EOD; } /* header_genbank() * * sqfp->buf is the first (LOCUS) line of the entry, or a line before * it (in which case we'll scan forwards to find the LOCUS line - even * skipping non-blank lines, because there are sometimes headers at * the start of GenBank files). * * On success, returns and: * sq->name contains sequence name (and may have been reallocated, changing sq->nalloc) * sq->acc contains seq accession (and may have been reallocated, changing sq->aalloc) * sq->desc contains description line (and may have been reallocated, changing sq->dalloc) * sq->roff has been set to the record offset * sq->doff has been set to the data offset (start of sequence line) * sqfp->buf is the first seq line. * * If no more seqs are found in the file, returns . * On parse failure, returns , leaves as mesg in ascii->errbuf. * * May also throw on allocation errors. */ static int header_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq) { char *s; char *tok; int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* Find LOCUS line, allowing for ignoration of a file header. */ if (ascii->nc == 0) return eslEOF; while (strncmp(ascii->buf, "LOCUS ", 8) != 0) { if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */ else if (status != eslOK) return status; /* abnormal */ } s = ascii->buf+12; if ((status = esl_strtok(&s, " ", &tok)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse name on LOCUS line", ascii->linenumber); if ((status = esl_sq_SetName(sq, tok)) != eslOK) return status; sq->roff = ascii->boff;/* record the disk offset to the LOCUS line */ /* Look for ORIGIN line, parsing optional info as we go. */ do { if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find ORIGIN line"); /* Optional VERSION line is parsed as "accession". */ if (strncmp(ascii->buf, "VERSION ", 10) == 0) { s = ascii->buf+12; if ((status = esl_strtok(&s, " \t\n", &tok)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse VERSION line", ascii->linenumber); if ((status = esl_sq_SetAccession(sq, tok)) != eslOK) return status; } /* Optional DEFINITION Line is parsed as "description". */ if (strncmp(ascii->buf, "DEFINITION ", 11) == 0) { s = ascii->buf+12; esl_strchop(s, ascii->nc-12); if ((status = esl_sq_AppendDesc(sq, s)) != eslOK) ESL_FAIL(status, ascii->errbuf, "Line %" PRId64 ": failed to parse desc on DEFINITION line", ascii->linenumber); } } while (strncmp(ascii->buf, "ORIGIN", 6) != 0); if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence"); sq->hoff = ascii->boff - 1; sq->doff = ascii->boff; return eslOK; } /* skip_genbank() * * Skip past the GenBank header and position to start of the sequence line. * * On success, returns and: * sq->roff has been set to the record offset * sq->doff has been set to the data offset (start of sequence line) * sqfp->buf is the first seq line. * * If no more seqs are found in the file, returns . * On parse failure, returns , leaves as mesg in ascii->errbuf. * * May also throw on allocation errors. */ static int skip_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* Find LOCUS line, allowing for ignoration of a file header. */ if (ascii->nc == 0) return eslEOF; while (strncmp(ascii->buf, "LOCUS ", 8) != 0) { if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */ else if (status != eslOK) return status; /* abnormal */ } sq->roff = ascii->boff;/* record the disk offset to the LOCUS line */ /* zero out the name, accession and description */ sq->name[0] = '\0'; sq->acc[0] = '\0'; sq->desc[0] = '\0'; /* Look for ORIGIN line, parsing optional info as we go. */ do { if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find ORIGIN line"); } while (strncmp(ascii->buf, "ORIGIN", 6) != 0); if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence"); sq->hoff = ascii->boff - 1; sq->doff = ascii->boff; return eslOK; } static int end_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq) { int status; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (strncmp(ascii->buf, "//", 2) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber); sq->eoff = ascii->boff + ascii->nc - 1; status = loadbuf(sqfp); if (status == eslEOF) return eslOK; /* ok, actually; we'll detect EOF on next sq read */ else if (status == eslOK) return eslOK; else return status; } /*----------------- end GenBank format -------------------------------*/ /***************************************************************** *# 9. Internal routines for FASTA format *****************************************************************/ static void config_fasta(ESL_SQFILE *sqfp) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; ascii->is_linebased = FALSE; ascii->eof_is_ok = TRUE; ascii->parse_header = &header_fasta; ascii->skip_header = &skip_fasta; ascii->parse_end = &end_fasta; } static void inmap_fasta(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap) { int x; if (abc_inmap != NULL) { for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x]; sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format } else { for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL; for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x; for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x; } sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */ sqfp->inmap[' '] = eslDSQ_IGNORED; sqfp->inmap['\t'] = eslDSQ_IGNORED; sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */ sqfp->inmap['\n'] = eslDSQ_EOL; sqfp->inmap['>'] = eslDSQ_EOD; /* \n is special - fasta reader detects it as an eol */ } /* header_fasta() * * sqfp->buf[sqfp->bpos] is sitting at the start of a FASTA record, or * at a space before it (in which case we'll advance, skipping whitespace, * until a > is reached). * Parse the header line, storing name and description in . * * On success, returns and: * sq->name contains sequence name (and may have been reallocated, changing sq->nalloc) * sq->desc contains description line (and may have been reallocated, changing sq->dalloc) * sq->roff has been set to the record offset * sq->doff has been set to the data offset (start of sequence line) * sqfp->buf[sqfp->bpos] is sitting at the start of the seq line. * sqfp->currpl,curbpl set to 0, to start bookkeeping data line lengths * * If no more seqs are found in the file, returns . * On parse failure, return , leaves as mesg in ascii->errbuf. * * May also throw on allocation errors. */ static int header_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq) { char c; int status = eslOK; void *tmp; int pos; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; /* make sure there are characters in the buffer */ if (ascii->nc == ascii->bpos && (status = loadbuf(sqfp)) != eslOK) return status; c = ascii->buf[ascii->bpos]; while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n) */ if (status == eslEOF) return eslEOF; if (status == eslOK && c == '>') { /* accept the > */ sq->roff = ascii->boff + ascii->bpos; /* store SSI record offset */ status = nextchar(sqfp, &c); } else if (c != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": unexpected char %c; expected FASTA to start with >", ascii->linenumber, c); while (status == eslOK && (c == '\t' || c == ' ')) status = nextchar(sqfp, &c); /* skip space */ /* Store the name (space delimited) */ pos = 0; while (status == eslOK && ! isspace(c)) { sq->name[pos++] = c; if (pos == sq->nalloc-1) { ESL_RALLOC(sq->name, tmp, sq->nalloc*2); sq->nalloc*=2; } status = nextchar(sqfp, &c); } if (pos == 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": no FASTA name found", ascii->linenumber); sq->name[pos] = '\0'; while (status == eslOK && (c == '\t' || c == ' ')) status = nextchar(sqfp, &c); /* skip space */ /* Store the description (end-of-line delimited) */ /* Patched to deal with NCBI NR desclines: delimit by ctrl-A (0x01) too. [SRE:H1/82] */ pos = 0; while (status == eslOK && c != '\n' && c != '\r' && c != 1) { sq->desc[pos++] = c; if (pos == sq->dalloc-1) { ESL_RALLOC(sq->desc, tmp, sq->dalloc*2); sq->dalloc*= 2; } status = nextchar(sqfp, &c); } sq->desc[pos] = '\0'; /* Because of the NCBI NR patch, c might be0x01 ctrl-A now; skip to eol. * (TODO: I'm worried about the efficiency of this nextchar() stuff. Revisit.) */ while (status == eslOK && c != '\n' && c != '\r') status = nextchar(sqfp, &c); sq->hoff = ascii->boff + ascii->bpos; while (status == eslOK && (c == '\n' || c == '\r')) status = nextchar(sqfp, &c); /* skip past eol (DOS \r\n, MAC \r, UNIX \n */ if (status != eslOK && status != eslEOF) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected failure in parsing FASTA name/description line"); /* Edge case: if the last sequence in the file is L=0, no residues, we are EOF now, not OK; but we'll return OK because we parsed the header line */ sq->doff = ascii->boff + ascii->bpos; ascii->prvrpl = ascii->prvbpl = -1; ascii->currpl = ascii->curbpl = 0; ascii->linenumber++; return eslOK; ERROR: return status;/* eslEMEM, from failed realloc */ } /* skip_fasta() * * Skip past the fasta header and position to start of the sequence line. * * On success, returns and: * sq->roff has been set to the record offset * sq->doff has been set to the data offset (start of sequence line) * sqfp->buf[sqfp->bpos] is sitting at the start of the seq line. * sqfp->currpl,curbpl set to 0, to start bookkeeping data line lengths * * If no more seqs are found in the file, returns . * On parse failure, return , leaves as mesg in ascii->errbuf. * * May also throw on allocation errors. */ static int skip_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq) { char c; int status = eslOK; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; c = ascii->buf[ascii->bpos]; while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n) */ if (status == eslEOF) return eslEOF; if (status != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected parsing error %d", status); if (c != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": unexpected char %c; expecting '>'", ascii->linenumber, c); sq->roff = ascii->boff + ascii->bpos; /* store SSI record offset */ /* zero out the name, accession and description */ sq->name[0] = '\0'; sq->acc[0] = '\0'; sq->desc[0] = '\0'; status = nextchar(sqfp, &c); /* skip to end of line */ while (status == eslOK && c != '\n' && c != '\r') status = nextchar(sqfp, &c); sq->doff = ascii->boff + ascii->bpos; /* skip past end of line */ while (status == eslOK && (c == '\n' || c == '\r')) status = nextchar(sqfp, &c); if (status != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF in parsing FASTA name/description line"); sq->doff = ascii->boff + ascii->bpos; ascii->linenumber++; return eslOK; } static int end_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->bpos < ascii->nc) { if (ascii->buf[ascii->bpos] != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Whoops, FASTA reader is corrupted"); sq->eoff = ascii->boff + ascii->bpos - 1; /* this puts eoff at the last \n */ } /* else, EOF, and we don't have to do anything. */ return eslOK; } /* Function: esl_sqascii_WriteFasta() * Synopsis: Write a sequence in FASTA foramt * * Purpose: Write sequence in FASTA format to the open stream . * * If is TRUE, then store record, data, and end * offsets in ; this ability is used by unit tests. * * Returns: on success. * * Throws: on allocation failure. * on system write error. */ int esl_sqascii_WriteFasta(FILE *fp, ESL_SQ *sq, int save_offsets) { char buf[61]; int64_t pos; if (save_offsets) sq->roff = ftello(fp); if (fprintf(fp, ">%s", sq->name) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed"); if (sq->acc[0] != 0 && fprintf(fp, " %s", sq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed"); if (sq->desc[0] != 0 && fprintf(fp, " %s", sq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed"); if (save_offsets) sq->hoff = ftello(fp); if (fputc('\n', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed"); buf[60] = '\0'; if (save_offsets) sq->doff = ftello(fp); for (pos = 0; pos < sq->n; pos += 60) { if (sq->dsq != NULL) esl_abc_TextizeN(sq->abc, sq->dsq+pos+1, 60, buf); else strncpy(buf, sq->seq+pos, 60); if (fprintf(fp, "%s\n", buf) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed"); } if (save_offsets) sq->eoff = ftello(fp) - 1; return eslOK; } /*------------------- end of FASTA i/o ---------------------------*/ /***************************************************************** *# 10. Internal routines for daemon format *****************************************************************/ /* Special case FASTA format where each sequence is terminated with "//". * * The use case is where the sequences are being read from a pipe and a * way is needed to signal the end of the sequence so it can be processed. * The next sequence might not be in the pipe, so the usual '>' is not * present to signal the end of the sequence. Also, an EOF is not * an option, since the daemon might run continuously. */ static void config_daemon(ESL_SQFILE *sqfp) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; ascii->is_linebased = FALSE; ascii->eof_is_ok = FALSE; ascii->parse_header = &header_fasta; ascii->skip_header = &skip_fasta; ascii->parse_end = &end_daemon; } static void inmap_daemon(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap) { int x; if (abc_inmap != NULL) { for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x]; sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format } else { for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL; for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x; for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x; } sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */ sqfp->inmap[' '] = eslDSQ_IGNORED; sqfp->inmap['\t'] = eslDSQ_IGNORED; sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */ sqfp->inmap['\n'] = eslDSQ_EOL; sqfp->inmap['/'] = eslDSQ_EOD; /* \n is special - fasta reader detects it as an eol */ } /* end_daemon() * * Special case FASTA format where each sequence is terminated with "//". * * The use case is were the sequences are being read from a pipe and a * way is needed to signal the end of the sequence so it can be processed. */ static int end_daemon(ESL_SQFILE *sqfp, ESL_SQ *sq) { char c; ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; if (ascii->nc < 3) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Whoops, daemon input stream is corrupted"); c = ascii->buf[ascii->bpos++]; if (c != '/') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber); c = ascii->buf[ascii->bpos++]; if (c != '/') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber); /* skip to end of line */ while (c != '\n' && c != '\r' && ascii->bpos < ascii->nc) c = ascii->buf[ascii->bpos++]; /* skip past end of line */ while ((c == '\n' || c == '\r') && ascii->bpos < ascii->nc) c = ascii->buf[ascii->bpos++]; return eslOK; } /* esl_sqascii_Parse() * * Parse a sequence already read into a buffer. */ int esl_sqascii_Parse(char *buf, int size, ESL_SQ *sq, int format) { int status; int64_t epos; int64_t n; ESL_SQFILE sqfp; ESL_SQASCII_DATA *ascii = &sqfp.data.ascii; /* fill in a dummy esl_sqfile structure used to parse buf */ ascii->fp = NULL; ascii->do_gzip = FALSE; ascii->do_stdin = FALSE; ascii->do_buffer = TRUE; ascii->mem = buf; ascii->allocm = 0; ascii->mn = size; ascii->mpos = 0; ascii->moff = -1; ascii->is_recording = FALSE; ascii->buf = NULL; ascii->boff = 0; ascii->balloc = 0; ascii->nc = 0; ascii->bpos = 0; ascii->L = 0; ascii->linenumber = 1; ascii->afp = NULL; ascii->msa = NULL; ascii->idx = -1; ascii->ssifile = NULL; ascii->rpl = -1;/* -1 = not set yet */ ascii->bpl = -1;/* (ditto) */ ascii->prvrpl = -1;/* (ditto) */ ascii->prvbpl = -1;/* (ditto) */ ascii->currpl = -1; ascii->curbpl = -1; ascii->ssi = NULL; /* Configure the 's parser and inmaps for this format. */ switch (format) { case eslSQFILE_EMBL: case eslSQFILE_UNIPROT: config_embl(&sqfp); inmap_embl(&sqfp, NULL); break; case eslSQFILE_GENBANK: case eslSQFILE_DDBJ: config_genbank(&sqfp); inmap_genbank(&sqfp, NULL); break; case eslSQFILE_FASTA: config_fasta(&sqfp); inmap_fasta(&sqfp, NULL); break; case eslSQFILE_DAEMON: config_daemon(&sqfp); inmap_daemon(&sqfp, NULL); break; default: return eslEFORMAT; } /* Main case: read next seq from sqfp's stream */ if ((status = ascii->parse_header(&sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */ do { if ((status = seebuf(&sqfp, -1, &n, &epos)) == eslEFORMAT) return status; if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM; addbuf(&sqfp, sq, n); ascii->L += n; sq->eoff = ascii->boff + epos - 1; if (status == eslEOD) break; } while ((status = loadbuf(&sqfp)) == eslOK); if (status == eslEOF) { if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?"); if ((status = ascii->parse_end(&sqfp, sq)) != eslOK) return status; } else if (status == eslEOD) { ascii->bpos = epos; if ((status = ascii->parse_end(&sqfp, sq)) != eslOK) return status; } else if (status != eslOK) return status; if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL; else sq->seq[sq->n] = '\0'; sq->start = 1; sq->end = sq->n; sq->C = 0; sq->W = sq->n; sq->L = sq->n; if (ascii->balloc > 0) free(ascii->buf); return eslOK; } /*-------------------- end of daemon ----------------------------*/ /***************************************************************** *# 11. Internal routines for HMMPGMD format *****************************************************************/ static int fileheader_hmmpgmd(ESL_SQFILE *sqfp) { ESL_SQASCII_DATA *ascii = &sqfp->data.ascii; char c; int status = eslOK; /* We've just loaded first buffer, after an Open. First char should be the # of the hmmpgmd file, * but let's tolerate leading whitespace anyway */ c = ascii->buf[ascii->bpos]; while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n, \r) */ if (status == eslEOF) return eslEOF; if (c != '#') ESL_FAIL(eslEFORMAT, ascii->errbuf, "hmmpgmd file expected to start with #"); /* skip first line; remainder of file is FASTA format */ while (status == eslOK && (c != '\n' && c != '\r')) status = nextchar(sqfp, &c); if (status == eslEOF) return eslEOF; /* next character read should be the '>' of the first FASTA record. We're properly positioned at "start of file". */ return eslOK; } /*-------------------- end of HMMPGMD ---------------------------*/