syntax = "proto3";

package mehari.txs;

// Stores long array of sequences with an "index" of sequence names to their
// index.
//
// The fields `aliases` and `aliases_idx` have the same length and
// `aliases_idx[i]` stores the index into `seqs` for the sequence `aliases[i]`.
// In other words, `seqs[aliases_idx[i]]` stores the sequence for `aliases[i]`.
message SequenceDb {
  // The sequence aliases, cf. `aliases_idx`.
  repeated string aliases = 1;
  // The corresponding index in `seqs`, cf. `aliases`.
  repeated uint32 aliases_idx = 2;
  // The corresponding sequences.
  repeated string seqs = 3;
}

// Indicates the reference assembly of the transcript database.
enum Assembly {
  // Unknown.
  ASSEMBLY_UNKNOWN = 0;
  // GRCh37.
  ASSEMBLY_GRCH37 = 1;
  // GRCh38.
  ASSEMBLY_GRCH38 = 2;
}

// Indicates the transcript source.
enum Source {
  // Unknown.
  SOURCE_UNKNOWN = 0;
  // RefSeq.
  SOURCE_REFSEQ = 1;
  // Ensembl.
  SOURCE_ENSEMBL = 2;
}

// Version information for the database.
message SourceVersion {
  // Version of mehari used to build the database.
  string mehari_version = 1;
  // Assembly used, either GRCh37 or GRCh38 (or Unknown).
  Assembly assembly = 2;
  // Version of the assembly, optional.
  optional string assembly_version = 3;
  // Source, either RefSeq or Ensembl (or Unknown).
  Source source_name = 4;
  // Version of the source, e.g. 112 for Ensembl.
  string source_version = 5;
  // Version of cdot.
  string cdot_version = 6;
}

// Mapping from gene to transcript ID.
message GeneToTxId {
  // Gene HGNC ID; serves as gene identifier.
  string gene_id = 1;
  // Vector of all transcript IDs.
  repeated string tx_ids = 2;
  // Whether this gene has been filtered out because of missing transcripts.
  optional bool filtered = 3;
  // Reason for filtering.
  optional uint32 filter_reason = 4;
}

// Container for the transcript-related database.
message TranscriptDb {
  // Vector of all transcripts.
  repeated Transcript transcripts = 1;
  // Mapping from gene ID to vector of all transcript IDs.
  repeated GeneToTxId gene_to_tx = 2;
}

// Enumeration for `Transcript::biotype`.
enum TranscriptBiotype {
  // Unknown.
  TRANSCRIPT_BIOTYPE_UNKNOWN = 0;
  // Coding transcript.
  TRANSCRIPT_BIOTYPE_CODING = 1;
  // Non-coding transcript.
  TRANSCRIPT_BIOTYPE_NON_CODING = 2;
}

// Values for the transcript tags, cf. `Transcript::tags`.
//
// The values are consecutive integers (not powers of two) and a transcript
// carries them as a `repeated` field, so they are individual tags rather than
// bits of a mask.
enum TranscriptTag {
  // Unknown.
  TRANSCRIPT_TAG_UNKNOWN = 0;
  // Member of Ensembl basic.
  TRANSCRIPT_TAG_BASIC = 1;
  // Member of Ensembl canonical.
  TRANSCRIPT_TAG_ENSEMBL_CANONICAL = 2;
  // Member of MANE Select.
  TRANSCRIPT_TAG_MANE_SELECT = 3;
  // Member of MANE Plus Clinical.
  TRANSCRIPT_TAG_MANE_PLUS_CLINICAL = 4;
  // Member of RefSeq Select.
  TRANSCRIPT_TAG_REF_SEQ_SELECT = 5;
  // Flagged as being a selenoprotein (UGA => selenon).
  TRANSCRIPT_TAG_SELENOPROTEIN = 6;
  // Member of GENCODE Primary.
  TRANSCRIPT_TAG_GENCODE_PRIMARY = 7;
  // Catch-all for other tags.
  TRANSCRIPT_TAG_OTHER = 8;
}

// Store information about a transcript.
message Transcript {
  // Transcript accession with version, e.g., `"NM_007294.3"` or
  // `"ENST00000461574.1"` for BRCA1.
  string id = 1;
  // HGNC symbol, e.g., `"BRCA1"`.
  string gene_symbol = 2;
  // HGNC gene identifier, e.g., `"1100"` for BRCA1.
  string gene_id = 3;
  // Transcript biotype.
  TranscriptBiotype biotype = 4;
  // Transcript flags.
  repeated TranscriptTag tags = 5;
  // Identifier of the corresponding protein.
  optional string protein = 6;
  // CDS start codon.
  optional int32 start_codon = 7;
  // CDS stop codon.
  optional int32 stop_codon = 8;
  // Alignments on the different genome builds.
  repeated GenomeAlignment genome_alignments = 9;
  // Whether this transcript has an issue (e.g. MissingStopCodon), cf.
  // `mehari::db::create::mod::Reason`.
  optional bool filtered = 10;
  // Reason for filtering.
  optional uint32 filter_reason = 11;
}

// Enumeration for the known genome builds.
enum GenomeBuild {
  // Unknown.
  GENOME_BUILD_UNKNOWN = 0;
  // GRCh37.
  GENOME_BUILD_GRCH37 = 1;
  // GRCh38.
  GENOME_BUILD_GRCH38 = 2;
}

// Enumeration for the two strands of the genome.
enum Strand {
  // Unknown.
  STRAND_UNKNOWN = 0;
  // Forward / plus.
  STRAND_PLUS = 1;
  // Reverse / minus.
  STRAND_MINUS = 2;
}

// Store information about a transcript aligning to a genome.
message GenomeAlignment {
  // The genome build identifier.
  GenomeBuild genome_build = 1;
  // Accession of the contig sequence.
  string contig = 2;
  // CDS start position, `-1` to indicate `None`.
  // NOTE(review): the field is also `optional`, so "unset" is a second way to
  // express `None` -- confirm which convention writers actually use.
  optional int32 cds_start = 3;
  // CDS end position, `-1` to indicate `None`.
  // NOTE(review): same `-1` vs. unset ambiguity as `cds_start` -- confirm.
  optional int32 cds_end = 4;
  // The strand.
  Strand strand = 5;
  // Exons of the alignment.
  repeated ExonAlignment exons = 6;
}

// Store the alignment of one exon to the reference.
message ExonAlignment {
  // Start position on reference.
  int32 alt_start_i = 1;
  // End position on reference.
  int32 alt_end_i = 2;
  // Exon number.
  int32 ord = 3;
  // CDS start coordinate.
  optional int32 alt_cds_start_i = 4;
  // CDS end coordinate.
  optional int32 alt_cds_end_i = 5;
  // CIGAR string of alignment, empty indicates full matches.
  string cigar = 6;
}

// Database of transcripts with sequences.
message TxSeqDatabase {
  // Store transcripts with their aliases.
  TranscriptDb tx_db = 1;
  // Store sequence with their aliases.
  SequenceDb seq_db = 2;
  // The version of the database.
  optional string version = 3;
  // NOTE(review): field number 4 is not used in this message. If it belonged
  // to a removed field, consider adding `reserved 4;` -- confirm against the
  // schema history.
  //
  // Version information; allow repeated here to be able to keep track of
  // information when merging databases.
  repeated SourceVersion source_version = 5;
}