/* Multiple sequence alignment file i/o * * See also: esl_msafile2.[ch], which contains a legacy ESL_MSAFILE2 interface * that includes support for --small option in various tools. */ #ifndef eslMSAFILE_INCLUDED #define eslMSAFILE_INCLUDED #include "esl_config.h" #include #include "esl_alphabet.h" /* digital alphabets */ #include "esl_buffer.h" /* string hashes, for mapping uniq seq names */ #include "esl_msa.h" /* ESL_MSA structure */ #include "esl_ssi.h" /* indexes of large flatfiles on disk */ /* Object: ESL_MSAFILE_FMTDATA * * Additional (often optional) information about variants of some file * formats. Not much in here right now - but figured this might need * to expand in the future, best to have the mechanism in place. * * Used in three ways: * 1. When opening an MSA file in a known format (as opposed to * guessing an unknown format), caller may provide an * structure containing any additional constraints on the format. * The new will copy this information into fmtd>. * 2. When opening an MSA file in an unknown format (calling GuessFileFormat()), * format-specific autodetectors fill in fmtd> with any additional * constraints. * 3. When writing an MSA file, caller may provide additional constraints on * the format; notably rpl>, the number of residues per line, * used for many formats. * * TODO: If this fills up with more information, we should eventually * consolidate the format code too; create ESL_MSAFORMAT structure * to hold both integer code and optional information; implement * it in esl_msaformat.[ch]; put format guessing routines there; * rename eslMSAFILE_* -> eslMSAFORMAT_*. For now, not worth the * time, because it's really only a placeholder dealing with a small * PHYLIP-specific format issue. , are generally * an ordered pair, to facilitate eventual replacement w/ single * . [SRE, 19 Jul 11] */ typedef struct { int namewidth; /* PHYLIP only: width of the name field (usually 10, but can vary) unset=0 */ int rpl; /* several formats: residues per line unset=0 */ } ESL_MSAFILE_FMTDATA; /* Object: ESL_MSAFILE * * An alignment file open for parsing. */ typedef struct { ESL_BUFFER *bf; /* input file/data being parsed */ int32_t format; /* format of alignment file we're reading */ ESL_MSAFILE_FMTDATA fmtd; /* additional (often optional) format-specific details. */ char *line; /* line read from by */ esl_pos_t n; /* length of line in bytes (line is not NUL-terminated) */ int64_t linenumber; /* input linenumber for diagnostics; -1 if we lose track */ esl_pos_t lineoffset; /* offset of start of in ; -1 if line unset */ ESL_DSQ inmap[128]; /* input map, 0..127 */ const ESL_ALPHABET *abc; /* non-NULL if in digital mode */ ESL_SSI *ssi; /* open SSI index; or NULL if none */ char errmsg[eslERRBUFSIZE]; /* user-directed message for normal errors */ } ESL_MSAFILE; /* Alignment file format codes. * Must coexist with sqio unaligned file format codes. * Rules: * - 0 is an unknown/unassigned format * - <=100 reserved for unaligned formats * - >100 reserved for aligned formats */ #define eslMSAFILE_UNKNOWN 0 /* unknown format */ #define eslMSAFILE_STOCKHOLM 101 /* Stockholm format, interleaved */ #define eslMSAFILE_PFAM 102 /* Pfam/Rfam one-line-per-seq Stockholm format */ #define eslMSAFILE_A2M 103 /* UCSC SAM's fasta-like a2m format */ #define eslMSAFILE_PSIBLAST 104 /* NCBI PSI-BLAST alignment format */ #define eslMSAFILE_SELEX 105 /* old SELEX format (largely obsolete) */ #define eslMSAFILE_AFA 106 /* aligned FASTA format */ #define eslMSAFILE_CLUSTAL 107 /* CLUSTAL format */ #define eslMSAFILE_CLUSTALLIKE 108 /* CLUSTAL-like formats (MUSCLE, PROBCONS) */ #define eslMSAFILE_PHYLIP 109 /* interleaved PHYLIP format */ #define eslMSAFILE_PHYLIPS 110 /* sequential PHYLIP format */ /* 1. Opening/closing an ESL_MSAFILE */ extern int esl_msafile_Open (ESL_ALPHABET **byp_abc, const char *msafile, const char *env, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp); extern int esl_msafile_OpenMem (ESL_ALPHABET **byp_abc, const char *p, esl_pos_t n, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp); extern int esl_msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp); extern void esl_msafile_OpenFailure(ESL_MSAFILE *afp, int status); extern int esl_msafile_SetDigital (ESL_MSAFILE *afp, const ESL_ALPHABET *abc); extern void esl_msafile_Close(ESL_MSAFILE *afp); /* 2. ESL_MSAFILE_FMTDATA: optional extra constraints on formats */ extern int esl_msafile_fmtdata_Init(ESL_MSAFILE_FMTDATA *fmtd); extern int esl_msafile_fmtdata_Copy(ESL_MSAFILE_FMTDATA *src, ESL_MSAFILE_FMTDATA *dst); /* 3. Utilities for different file formats */ extern int esl_msafile_GuessFileFormat(ESL_BUFFER *bf, int *ret_fmtcode, ESL_MSAFILE_FMTDATA *fmtd, char *errbuf); extern int esl_msafile_IsMultiRecord(int fmt); extern int esl_msafile_EncodeFormat(char *fmtstring); extern char *esl_msafile_DecodeFormat(int fmt); /* 4. Utilities for different alphabets */ extern int esl_msafile_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type); /* 5. Random access in a MSA flatfile database */ extern int esl_msafile_PositionByKey(ESL_MSAFILE *afp, const char *key); /* 6. Reading an MSA from an ESL_MSAFILE */ extern int esl_msafile_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa); extern void esl_msafile_ReadFailure(ESL_MSAFILE *afp, int status); /* 7. Writing an MSA to a stream */ extern int esl_msafile_Write(FILE *fp, ESL_MSA *msa, int fmt); /* 8. Utilities for specific parsers */ extern int esl_msafile_GetLine(ESL_MSAFILE *afp, char **opt_p, esl_pos_t *opt_n); extern int esl_msafile_PutLine(ESL_MSAFILE *afp); #include "esl_msafile_a2m.h" #include "esl_msafile_afa.h" #include "esl_msafile_clustal.h" #include "esl_msafile_phylip.h" #include "esl_msafile_psiblast.h" #include "esl_msafile_selex.h" #include "esl_msafile_stockholm.h" #endif /*eslMSAFILE_INCLUDED*/