/* A simple token-based file parsing system. * * Contents: * 1. The ESL_FILEPARSER object and its API. * 2. Private functions. * 3. Unit tests. * 4. Test driver. * 5. Examples. */ #include #include #include #include #include #include "easel.h" #include "esl_fileparser.h" static int nextline(ESL_FILEPARSER *efp); /***************************************************************** * 1. The ESL_FILEPARSER object and its API. *****************************************************************/ /* Function: esl_fileparser_Open() * Incept: SRE, Tue Apr 3 08:09:56 2007 [Janelia] * * Purpose: Opens for reading. * * As a special case, if is "-", set up the * fileparser to read and parse . * * is optional name of an environment variable, * such as . This environment variable contains a * colon-delimited list of directories in which the * may lie relative to. We looks first relative * to the current working directory, then in any * directories specified by . If is , * we only look in the current working directory. * * Args: filename - filename, relative path, or fully qualified path * envvar - optional environment variable name to find * colon-delimited list of directories * may reside in; or * ret_efp - RETURN: opened * * Returns: on success, and points * to a new object. * * Returns if can't * be opened for reading, and is set * to . * * Throws: on allocation failure. 
*/ int esl_fileparser_Open(const char *filename, const char *envvar, ESL_FILEPARSER **ret_efp) { int status; ESL_FILEPARSER *efp = NULL; if ((efp = esl_fileparser_Create(NULL)) == NULL) { status = eslEMEM; goto ERROR; } if (strcmp(filename, "-") == 0) efp->fp = stdin; else if ((efp->fp = fopen(filename, "r")) != NULL) { if ((status = esl_strdup(filename, -1, &(efp->filename))) != eslOK) goto ERROR; } else if ((status = esl_FileEnvOpen(filename, envvar, &(efp->fp), &(efp->filename))) != eslOK) { status = eslENOTFOUND; goto ERROR; } *ret_efp = efp; return eslOK; ERROR: esl_fileparser_Close(efp); *ret_efp = NULL; return status; } /* Function: esl_fileparser_Create() * Incept: SRE, Fri Jul 9 12:50:29 2004 [St. Louis] * * Purpose: Take an open file , and transform it to * a fileparser object -- preparing to parse it * one whitespace-delimited field at a time. * * Args: fp - open FILE to parse * * Returns: a new object, which must be * free'd by the caller with . * * Throws: if an allocation failed. * * Xref: STL8 p.56. */ ESL_FILEPARSER * esl_fileparser_Create(FILE *fp) { int status; ESL_FILEPARSER *efp = NULL; ESL_ALLOC(efp, sizeof(ESL_FILEPARSER)); efp->fp = fp; efp->buf = NULL; efp->buflen = 0; efp->s = NULL; efp->commentchar = '\0'; efp->filename = NULL; efp->linenumber = 0; efp->errbuf[0] = '\0'; efp->is_buffer = FALSE; efp->mem_buffer = NULL; efp->mem_size = 0; efp->mem_pos = 0; return efp; ERROR: esl_fileparser_Destroy(efp); return NULL; } /* Function: esl_fileparser_CreateMapped() * Incept: MSF, Mon Aug 16 2010 [Janelia] * * Purpose: Sets up a memory buffer to be parsed with the file parser * routines. * * Args: fp - open FILE to parse * * Returns: a new object, which must be * free'd by the caller with . * * Throws: if an allocation failed. * * Xref: STL8 p.56. 
*/ ESL_FILEPARSER * esl_fileparser_CreateMapped(const void *buffer, int size) { ESL_FILEPARSER *efp = NULL; if ((efp = esl_fileparser_Create(NULL)) == NULL) return NULL; efp->is_buffer = TRUE; efp->mem_buffer = buffer; efp->mem_size = size; return efp; } /* Function: esl_fileparser_SetCommentChar() * * Purpose: Defines a single character for comments. Anything * on a line following this character is ignored * when parsing. * * Args: efp - open fileparser * c - comment character ('#', for example) * * Returns: on success. */ int esl_fileparser_SetCommentChar(ESL_FILEPARSER *efp, char c) { efp->commentchar = c; return eslOK; } /* Function: esl_fileparser_GetToken() * Incept: SRE, Fri Jul 9 13:03:50 2004 [St. Louis] * * Purpose: Sets a pointer to the next field in the * file we're parsing. * * The pointer is into an internal line buffer * that may be invalidated upon the next call to a * function. If you want to store it, make a * copy. * * Args: efp - open fileparser * opt_tok - optRETURN: ptr to next field * opt_toklen - optRETURN: length of tok. * * Returns: if , contain valid data. * on normal end-of-file. * * Throws: if an allocation fails. * * Xref: STL8 p.56. */ int esl_fileparser_GetToken(ESL_FILEPARSER *efp, char **opt_tok, int *opt_toklen) { char *tok = NULL; int toklen = 0; int tokcode; int fcode; int goodtok; if (opt_tok) *opt_tok = NULL; if (opt_toklen) *opt_toklen = 0; /* First, make sure we have a line loaded. * On the first call to GetToken, we won't. */ if (efp->buf == NULL) { fcode = nextline(efp); if (fcode != eslOK) return fcode; } /* Start strtok()'ing this line to try to find token. * If we don't find one, keep loading lines until we * do, or we run out of data. * efp->s was set either by nextline() or previous GetToken(). 
*/ do { goodtok = FALSE; tokcode = esl_strtok_adv(&(efp->s), " \t\r\n", &tok, &toklen, NULL); if (tokcode == eslEOL || (tokcode == eslOK && *tok == efp->commentchar)) { fcode = nextline(efp); if (fcode != eslOK) return fcode; } else if (tokcode == eslOK) goodtok = TRUE; else ESL_FAIL(tokcode, efp->errbuf, "esl_strtok() failed"); } while (! goodtok); if (opt_tok) *opt_tok = tok; if (opt_toklen) *opt_toklen = toklen; return eslOK; } /* Function: esl_fileparser_NextLine() * Incept: SRE, Tue Apr 3 08:27:22 2007 [Janelia] * * Purpose: Advance the parser to the next non-blank, non-comment * data line that contains at least one token. * * Upon return, buf> is a data-containing line, and * s> points to the first non-whitespace character on * it. A line-based parser can work on one or both of these. * * A line-oriented but token-based parser will call * to extract successive * tokens from it. * * A pure token-based parser will generally not call * <_NextLine()>. The only reason would be to skip the * remainder of a line it's in the middle of parsing, and * advance to the next one -- but that's a sort of * line-oriented thing to do. * * Returns: on success. * if no more data lines remain in the file. * * Throws: on allocation error. */ int esl_fileparser_NextLine(ESL_FILEPARSER *efp) { int status; while ((status = nextline(efp)) == eslOK) { while (*(efp->s) != '\0' && isspace(*(efp->s))) efp->s++; if (*(efp->s) != '\0' && *efp->s != efp->commentchar) break; } if (status == eslEOF) return status; if (status != eslOK) ESL_FAIL(status, efp->errbuf, "nextline() failed"); return eslOK; } /* Function: esl_fileparser_NextLinePeeked() * Synopsis: Read the next line, prepending a peek. * Incept: SRE, Wed Oct 15 10:08:37 2008 [Janelia] * * Purpose: Sometimes we need to peek at the start of an input stream * to see whether it is in a binary format, before we start * parsing it as ASCII lines. 
 *            When this happens, the caller will typically have used
 *            <fread()> to read a fixed number of bytes from the input
 *            stream, checked to see if they are a magic number
 *            representing a binary format, and found that they are
 *            not. The caller then wants to switch to reading in ASCII
 *            format with the <ESL_FILEPARSER> API, but with those
 *            bytes included on the first line. Because the file might
 *            start with comments or blank lines that need to be
 *            skipped, we want to deal with the peeked data in the
 *            context of the <ESL_FILEPARSER>. The caller cannot simply
 *            close and reopen the stream, because the stream may be a
 *            pipe (<stdin>, for example).
 *
 *            The caller passes the bytes it peeked at with <fread()>
 *            in <prefix>, and the number of bytes it peeked at in
 *            <plen>.
 *
 *            The parser is advanced to the next non-blank,
 *            non-comment data line that contains at least one token,
 *            taking the prepended <prefix> into account.
 *
 *            There is a significant flaw in this mechanism, and as a
 *            result the caller must be able to guarantee the
 *            following limitation. The first data-containing line
 *            must be longer than <plen>. It is sufficient for the
 *            first data token to be longer than <plen>.
 *            (Equivalently, if <prefix> contains any data token, it
 *            must not contain any newline \verb+\n+ after that data.)
 *            The reason is that we need to avoid a situation where
 *            the concatenated prefix+nextline contains more than one
 *            data line, because other routines in the module assume
 *            that <efp->buf> is a single \verb+\n+-terminated line of
 *            input. For example: HMMER save files either start with a
 *            4-byte binary magic number, or with "HMMER", and "HMMER"
 *            is longer than 4 bytes.
 *
 * Args:      efp    - open fileparser
 *            prefix - bytes that caller obtained by peeking with fread()
 *            plen   - number of bytes in prefix
 *
 * Returns:   <eslOK> on success.
 *            <eslEOF> if no more tokens remain in the file.
 *
 * Throws:    <eslEMEM> on allocation error.
 *
 * Xref:      For an example, see HMMER's HMM save file input.
*/ int esl_fileparser_NextLinePeeked(ESL_FILEPARSER *efp, char *prefix, int plen) { int blen; int status; /* First, make buf = the first line again, by prepending . */ if ((status = nextline(efp)) != eslOK) goto ERROR; /* EOF, EMEM */ blen = strlen(efp->buf); if (blen + plen + 1 > efp->buflen) { ESL_REALLOC(efp->buf, sizeof(char) * (blen + plen + 1)); efp->buflen = blen + plen + 1; } memmove(efp->buf+plen, efp->buf, blen+1); memcpy(efp->buf, prefix, plen); efp->s = efp->buf; while (*(efp->s) != '\0' && isspace(*(efp->s))) efp->s++; if (*(efp->s) != '\0' && *efp->s != efp->commentchar) return eslOK; else return esl_fileparser_NextLine(efp); ERROR: return status; } /* Function: esl_fileparser_GetTokenOnLine() * Incept: SRE, Tue Apr 3 08:46:59 2007 [Janelia] * * Purpose: Same as , except that it only * retrieves tokens from the line that the parser is * on. When it runs out of tokens on the line, it returns * . This allows a caller to count the tokens on a * line (whereas reads through newlines * silently). * * The pointer is into an internal line buffer * that may be invalidated upon the next call to a * function. If you want to store it, make a * copy. * * Normally, a call to * would be preceded by to * position the parser on the next data line with at least * one token on it. However, you could also conceivably * call after one or more * calls to , to get remaining * tokens from a given line. What you can't do is to call * immediately after * opening a file; the parser won't have a line loaded yet. * (In this case, it would return .) * * Returns: on success, and the token and its length are * in and . * * Returns if no more tokens exist on the line; * in this case is set to and * to 0. */ int esl_fileparser_GetTokenOnLine(ESL_FILEPARSER *efp, char **opt_tok, int *opt_toklen) { char *tok = NULL; int toklen = 0; int status; /* No line loaded? Then we can't find any token on it. 
*/ if (efp->buf == NULL) { status = eslEOL; goto ERROR; } /* Find next token in the line that's already loaded in the parser. */ status = esl_strtok_adv(&(efp->s), " \t\r\n", &tok, &toklen, NULL); if (status == eslEOL) goto ERROR; if (status != eslOK) goto ERROR; if (status == eslOK && *tok == efp->commentchar) { status = eslEOL; goto ERROR; } if (opt_tok) *opt_tok = tok; if (opt_toklen) *opt_toklen = toklen; return eslOK; ERROR: if (opt_tok) *opt_tok = NULL; if (opt_toklen) *opt_toklen = 0; return status; } /* Function: esl_fileparser_GetRemainingLine() * Synopsis: Returns pointer to the rest of the current line. * Incept: SRE, Mon Oct 13 08:59:26 2008 [Janelia] * * Purpose: Set a pointer <*ret_s> to the rest of the current line * held by the fileparser . Trailing newline char, * if any, is removed. * * Because points to internal storage in the * fileparser, the caller should be finished with it before * making its next call to any fileparser function. * * Any comment characters on the rest of the line are * ignored: this is designed for a case where the rest of * the line is to be read as free text. * * Args: efp - fileparser * ret_s - RETURN: pointer to the remainder of the line * * Returns: on success. * if nothing remains on the line, and <*ret_s> * is . * * Throws: (no abnormal error conditions) */ int esl_fileparser_GetRemainingLine(ESL_FILEPARSER *efp, char **ret_s) { /* No line loaded? Then we can't find anything on it. */ if (efp->buf == NULL) { *ret_s = NULL; return eslEOL; } /* skip leading whitespace */ while (isspace(*(efp->s))) efp->s++; /* Return everything to end of line as a "token", stripping newline */ return esl_strtok(&(efp->s), "\r\n", ret_s); } /* Function: esl_fileparser_Destroy() * Incept: SRE, Fri Jul 9 13:22:36 2004 [St. Louis] * * Purpose: Frees an open . The original fp is * still open - whoever opened it is still * responsible for closing it. * * Xref: STL8 p.56. 
*/
void
esl_fileparser_Destroy(ESL_FILEPARSER *efp)
{
  if (efp == NULL) return;

  free(efp->buf);               /* free(NULL) is a no-op */
  free(efp);
}

/* Function:  esl_fileparser_Close()
 * Incept:    SRE, Tue Apr  3 08:18:11 2007 [Janelia]
 *
 * Purpose:   Closes an open <ESL_FILEPARSER>, including the
 *            file it opened. <stdin> is never closed. A <NULL>
 *            <efp> is accepted and ignored.
 */
void
esl_fileparser_Close(ESL_FILEPARSER *efp)
{
  if (efp == NULL) return;

  if (efp->fp != NULL && efp->fp != stdin) fclose(efp->fp);
  free(efp->filename);          /* free(NULL) is a no-op */
  esl_fileparser_Destroy(efp);
}

/*****************************************************************
 * 2. Private functions
 *****************************************************************/

/* nextline()
 *
 * Purpose:   Skip the file parser to the next line (for instance,
 *            if an end-of-line comment is found). The new line might
 *            have no tokens on it.
 *
 *            In memory-buffer mode the next line (through its
 *            newline, if any) is copied out of the mapped buffer into
 *            <efp->buf> and NUL-terminated; in stream mode the line
 *            is read with <esl_fgets()>. Either way <efp->s> is reset
 *            to the start of the line and the line number is bumped.
 *
 * Args:      efp  - open file parser
 *
 * Returns:   eslOK:   success
 *            eslEOF:  normal end of file
 *
 * Throws:    <eslEMEM> if a reallocation failed.
 *
 * Xref:      STL8 p.56
 */
static int
nextline(ESL_FILEPARSER *efp)
{
  int status;

  if (! efp->is_buffer)
    {
      /* Stream mode. */
      status = esl_fgets(&(efp->buf), &(efp->buflen), efp->fp);
      if (status != eslOK) ESL_FAIL(status, efp->errbuf, "esl_fgets() failed");
    }
  else
    {
      /* Memory-buffer mode. */
      const char *start;        /* first byte of the line in the mapped buffer */
      int         remaining;    /* bytes left in the mapped buffer              */
      int         n;            /* length of this line, newline included        */

      if (efp->mem_pos >= efp->mem_size) return eslEOF;

      remaining = efp->mem_size - efp->mem_pos;
      start     = efp->mem_buffer + efp->mem_pos;

      /* Measure up to the newline, then count the newline too if present. */
      for (n = 0; n < remaining && start[n] != '\n'; n++)
        ;
      if (n < remaining) n++;

      /* Need room for the line plus its terminating NUL. */
      if (n + 1 > efp->buflen)
        {
          ESL_REALLOC(efp->buf, ESL_MAX(64, n * 2));
          efp->buflen = ESL_MAX(64, n * 2);
        }

      memcpy(efp->buf, efp->mem_buffer + efp->mem_pos, n);
      efp->buf[n]   = 0;
      efp->mem_pos += n;
    }

  efp->s = efp->buf;
  efp->linenumber++;
  return eslOK;

 ERROR:
  return status;
}

/*****************************************************************
 * 3. Unit tests.
*****************************************************************/ #ifdef eslFILEPARSER_TESTDRIVE /* test the interface for getting all tokens in a file, regardless * of newlines. Also, uses the Create/Destroy interface instead of * Open/Close. */ static void utest_GetToken(char *filename) { int status; ESL_FILEPARSER *efp = NULL; FILE *fp = NULL; char *tok = NULL; int toklen = 0; int ntok = 0; if ((fp = fopen(filename, "r")) == NULL) esl_fatal("File open failed"); if ((efp = esl_fileparser_Create(fp)) == NULL) esl_fatal("Failed to associate stream with fileparser"); esl_fileparser_SetCommentChar(efp, '#'); while ((status = esl_fileparser_GetToken(efp, &tok, &toklen)) == eslOK) { if (toklen != 6) esl_fatal("bad token %s", tok); if (strncmp(tok, "token", 5) != 0) esl_fatal("bad token %s", tok); ntok++; } if (status != eslEOF) esl_fatal("Abnormal parse termination"); if (ntok != 5) esl_fatal("bad total token number %d\n", ntok); esl_fileparser_Destroy(efp); fclose(fp); return; } /* test the NextLine and GetTokenOnLine interface, as well as the * Open/Close interface. 
*/ static void utest_GetTokenOnLine(char *filename) { int status; ESL_FILEPARSER *efp = NULL; char *tok = NULL; int toklen = 0; int ntok = 0; int nlines = 0; char expect[32]; if (esl_fileparser_Open(filename, NULL, &efp) != eslOK) esl_fatal("File open failed"); esl_fileparser_SetCommentChar(efp, '#'); while ((status = esl_fileparser_NextLine(efp)) == eslOK) { nlines++; while ((status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen)) == eslOK) { ntok++; snprintf(expect, 32, "token%d", ntok); // 32 is from the static allocation of expect[32] if (toklen != 6) esl_fatal("bad token length for %s", tok); if (strcmp(expect, tok) != 0) esl_fatal("bad token %s", tok); } if (status != eslEOL) esl_fatal("Unexpected code in place of end-of-line"); } if (status != eslEOF) esl_fatal("Unexpected code in place of end-of-file."); if (nlines != 3) esl_fatal("expected to parse 3 lines; parsed %d", nlines); if (ntok != 5) esl_fatal("expected to parse 5 tokens; parsed %d", ntok); esl_fileparser_Close(efp); return; } static void utest_GetTokenBuffered(char *buffer) { int status; ESL_FILEPARSER *efp = NULL; char *tok = NULL; int toklen = 0; int ntok = 0; if ((efp = esl_fileparser_CreateMapped(buffer, strlen(buffer))) == NULL) esl_fatal("Failed to associate buffer with fileparser"); esl_fileparser_SetCommentChar(efp, '#'); while ((status = esl_fileparser_GetToken(efp, &tok, &toklen)) == eslOK) { if (toklen != 6) esl_fatal("bad token %s", tok); if (strncmp(tok, "token", 5) != 0) esl_fatal("bad token %s", tok); ntok++; } if (status != eslEOF) esl_fatal("Abnormal parse termination"); if (ntok != 5) esl_fatal("bad total token number %d\n", ntok); esl_fileparser_Destroy(efp); return; } #endif /*eslFILEPARSER_TESTDRIVE*/ /***************************************************************** * 4. Test driver. *****************************************************************/ /* gcc -g -Wall -I. 
-o test -DeslFILEPARSER_TESTDRIVE esl_fileparser.c easel.c ./test */ #ifdef eslFILEPARSER_TESTDRIVE #include #include #include "easel.h" #include "esl_fileparser.h" int main(int argc, char **argv) { char tmpfile[32] = "esltmpXXXXXX"; FILE *fp; char stream[] = "# Full line comment\n" "token1 # Trailing comment\n" "\n" /* blank line */ " \n" /* whitespace line */ " # sowing comment/whitespace confusion...\n" "token2\ttoken3 token4\n" "token5"; /* file ends w/ no \n */ /* Create a test file to read. */ if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal("File open failed"); fprintf(fp, "%s", stream); fclose(fp); /* Run unit tests using that file. * Unit tests have hardwired knowledge of what's supposed to be in the file. */ utest_GetToken(tmpfile); utest_GetTokenOnLine(tmpfile); utest_GetTokenBuffered(stream); remove(tmpfile); return 0; } #endif /*eslFILEPARSER_TESTDRIVE*/ /***************************************************************** * 5. Examples. *****************************************************************/ /* The first example shows the simplest interface: get all tokens * in the file, one at a time. * gcc -g -Wall -I. -o example -DeslFILEPARSER_EXAMPLE esl_fileparser.c easel.c ./example */ #ifdef eslFILEPARSER_EXAMPLE /*::cexcerpt::fileparser_example::begin::*/ #include #include "easel.h" #include "esl_fileparser.h" int main(int argc, char **argv) { char *filename = argv[1]; int ntok = 1; ESL_FILEPARSER *efp; char *tok; int toklen; if (esl_fileparser_Open(filename, NULL, &efp) != eslOK) esl_fatal("File open failed"); esl_fileparser_SetCommentChar(efp, '#'); while (esl_fileparser_GetToken(efp, &tok, &toklen) == eslOK) { printf("%5d %3d %s\n", ntok, toklen, tok); ntok++; } esl_fileparser_Close(efp); return 0; } /*::cexcerpt::fileparser_example::end::*/ #endif /*eslFILEPARSER_EXAMPLE*/ /* The second example shows the more line-oriented interface * of NextLine(), GetTokenOnLine(). gcc -g -Wall -I. 
-o example -DeslFILEPARSER_EXAMPLE2 esl_fileparser.c easel.c
   ./example <file>
*/
#ifdef eslFILEPARSER_EXAMPLE2
/*::cexcerpt::fileparser_example2::begin::*/
#include <stdio.h>

#include "easel.h"
#include "esl_fileparser.h"

int
main(int argc, char **argv)
{
  char           *filename;
  int             nline = 1;    /* count of data lines seen so far (1-based) */
  int             ntok;
  ESL_FILEPARSER *efp;
  char           *tok;
  int             toklen;

  /* argv[1] is only valid when a file argument was actually given. */
  if (argc != 2) esl_fatal("Usage: ./example <file>");
  filename = argv[1];

  if (esl_fileparser_Open(filename, NULL, &efp) != eslOK)
    esl_fatal("File open failed");
  esl_fileparser_SetCommentChar(efp, '#');

  /* One pass per data line; count the tokens found on each. */
  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      ntok = 0;
      while (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) == eslOK)
        ntok++;
      printf("Line %d in the file (%d non-blank, non-comment) contains %d tokens...\n",
             efp->linenumber, nline, ntok);
      nline++;
    }

  esl_fileparser_Close(efp);
  return 0;
}
/*::cexcerpt::fileparser_example2::end::*/
#endif /*eslFILEPARSER_EXAMPLE2*/