syntax = "proto3";

package annonars.gnomad.mtdna;

import "annonars/gnomad/vep_gnomad3.proto";

// Site-level filter values that can be attached to a variant.
enum Filter {
  // Unknown filter (zero value).
  FILTER_UNKNOWN = 0;
  // The variant overlaps a site that the literature commonly reports as
  // artifact prone.
  FILTER_ARTIFACT_PRONE_SITE = 1;
  // All samples carrying the variant call had at least 2 different
  // heteroplasmic indels called at the position.
  FILTER_INDEL_STACK = 2;
  // No-pass-genotypes site: no individual was PASS for the variant.
  FILTER_NO_PASS_GENOTYPE = 3;
}

// Quality-related fields taken from the VCF INFO column.
message QualityInfo {
  // Depth at the site, averaged over all individuals.
  optional float dp_mean = 1;
  // MMQ (median mapping quality), averaged over the individuals that have a
  // variant at the site.
  optional float mq_mean = 2;
  // TLOD (log 10 likelihood ratio score of the variant existing versus not
  // existing), averaged over the individuals that have a variant at the site.
  optional float tlod_mean = 3;
}

// Heteroplasmy-level fields taken from the VCF INFO column.
//
// Histograms in this message use the heteroplasmy bin edges
// [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0].
message HeteroplasmyInfo {
  // Histogram of the number of individuals with a heteroplasmy level below
  // 0.1, binned with the edges given on the message.
  repeated int32 heteroplasmy_below_min_het_threshold_hist = 1;
  // Histogram of heteroplasmy levels, binned with the edges given on the
  // message.
  repeated int32 hl_hist = 2;
  // Set if the variant is found at an overall frequency of 0.001 across all
  // samples with a heteroplasmy level > 0 and < 0.50. This includes variants
  // with < 0.01 heteroplasmy, which are subsequently filtered.
  bool common_low_heteroplasmy = 3;
  // Highest heteroplasmy level observed for the variant among all samples.
  float max_hl = 4;
}

// Encapsulate VCF INFO fields related to filter failure histograms.
message FilterHistograms {
  // Every histogram below counts individuals failing one filter, binned by
  // heteroplasmy level with the bin edges
  // [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0].

  // Individuals failing the base_qual filter (alternate allele median base
  // quality), per heteroplasmy bin.
  repeated int32 base_qual_hist = 1;
  // Individuals failing the position filter (median distance of alternate
  // variants from end of reads), per heteroplasmy bin.
  repeated int32 position_hist = 2;
  // Individuals failing the strand_bias filter (evidence for the alternate
  // allele comes from one read direction only), per heteroplasmy bin.
  repeated int32 strand_bias_hist = 3;
  // Individuals failing the weak_evidence filter (mutation does not meet the
  // likelihood threshold), per heteroplasmy bin.
  repeated int32 weak_evidence_hist = 4;
  // Individuals failing the contamination filter, per heteroplasmy bin.
  repeated int32 contamination_hist = 5;
}

// Encapsulate VCF INFO fields related to populations.
message PopulationInfo {
  // All lists in this message follow the population order
  // ['afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'nfe', 'oth', 'sas', 'mid'].

  // Overall allele number, one entry per population.
  repeated int32 pop_an = 1;
  // AC_het, one entry per population.
  repeated int32 pop_ac_het = 2;
  // AC_hom, one entry per population.
  repeated int32 pop_ac_hom = 3;
  // AF_hom, one entry per population.
  repeated float pop_af_hom = 4;
  // AF_het, one entry per population.
  repeated float pop_af_het = 5;
  // Heteroplasmy level histograms for each population; bin edges are
  // [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0].
  //
  // Protocol buffers have no native nested repeated fields, so all the
  // per-population lists are concatenated into this single flat list.
  repeated int32 pop_hl_hist = 6;
}

// Haplogroup-related fields taken from the VCF INFO column.
//
// Per-haplogroup lists in this message follow the haplogroup order
// ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'HV', 'I', 'J', 'K', 'L0', 'L1',
// 'L2', 'L3', 'L4', 'L5', 'M', 'N', 'P', 'R', 'T', 'U', 'V', 'W', 'X', 'Y',
// 'Z'].
message HaplogroupInfo {
  // Set if the variant is a haplogroup defining variant in PhyloTree build 17.
  bool hap_defining_variant = 1;
  // Overall allele number, one entry per haplogroup.
  repeated int32 hap_an = 2;
  // AC_het, one entry per haplogroup.
  repeated int32 hap_ac_het = 3;
  // AC_hom, one entry per haplogroup.
  repeated int32 hap_ac_hom = 4;
  // AF_het, one entry per haplogroup.
  repeated float hap_af_het = 5;
  // AF_hom, one entry per haplogroup.
  repeated float hap_af_hom = 6;
  // Heteroplasmy level histograms for each haplogroup; bin edges are
  // [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0].
  //
  // Protocol buffers have no native nested repeated fields, so all the
  // per-haplogroup lists are concatenated into this single flat list.
  repeated int32 hap_hl_hist = 7;
  // Filtering allele frequency restricted to homoplasmic variants, one entry
  // per haplogroup.
  repeated float hap_faf_hom = 8;
  // Name of the haplogroup that has the maximum AF_hom.
  optional string hapmax_af_hom = 9;
  // Name of the haplogroup that has the maximum AF_het.
  optional string hapmax_af_het = 10;
  // Maximum, across haplogroups, of the filtering allele frequency restricted
  // to homoplasmic variants.
  optional float faf_hapmax_hom = 11;
}

// Age-related fields taken from the VCF INFO column.
//
// Age histograms use the bin edges
// [30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0, 70.0, 75.0, 80.0].
message AgeInfo {
  // Age histogram of individuals with a homoplasmic variant.
  repeated int32 age_hist_hom_bin_freq = 1;
  // Number of age values below the lowest bin edge, for individuals with a
  // homoplasmic variant.
  optional int32 age_hist_hom_n_smaller = 2;
  // Number of age values above the highest bin edge, for individuals with a
  // homoplasmic variant.
  optional int32 age_hist_hom_n_larger = 3;
  // Age histogram of individuals with a heteroplasmic variant.
  repeated int32 age_hist_het_bin_freq = 4;
  // Number of age values below the lowest bin edge, for individuals with a
  // heteroplasmic variant.
  optional int32 age_hist_het_n_smaller = 5;
  // Number of age values above the highest bin edge, for individuals with a
  // heteroplasmic variant.
  optional int32 age_hist_het_n_larger = 6;
}

// Depth-related fields taken from the VCF INFO column.
//
// Depth (dp) histograms use the bin edges
// [0.0, 200.0, 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0, 1600.0, 1800.0,
// 2000.0].
message DepthInfo {
  // Number of dp values above the highest bin edge, over all individuals.
  optional int32 dp_hist_all_n_larger = 1;
  // Number of dp values above the highest bin edge, over individuals with the
  // alternative allele.
  optional int32 dp_hist_alt_n_larger = 2;
  // Histogram of dp values over all individuals.
  repeated int32 dp_hist_all_bin_freq = 3;
  // Histogram of dp values over individuals with the alternative allele.
  repeated int32 dp_hist_alt_bin_freq = 4;
}

// One gnomAD-mtDNA VCF record.
//
// To keep this message from growing humongous, the more specialized INFO
// column fields live in separate, optional sub-messages (fields 18 to 24).
message Record {
  // Name of the chromosome.
  string chrom = 1;
  // Start position, 1-based.
  int32 pos = 2;
  // Reference allele.
  string ref_allele = 3;
  // Alternate allele.
  string alt_allele = 4;
  // The variant written in RefPosAlt format.
  string variant_collapsed = 5;
  // Excluded allele count: number of individuals in which the variant was
  // filtered out.
  int32 excluded_ac = 6;
  // Overall allele number: number of samples with a non-missing genotype.
  int32 an = 7;
  // Allele count restricted to variants with heteroplasmy level >= 0.95.
  int32 ac_hom = 8;
  // Allele count restricted to variants with heteroplasmy level >= 0.10 and
  // < 0.95.
  int32 ac_het = 9;
  // Allele frequency restricted to variants with heteroplasmy level >= 0.95.
  float af_hom = 10;
  // Allele frequency restricted to variants with heteroplasmy level >= 0.10
  // and < 0.95.
  float af_het = 11;
  // Filters that apply at the site level.
  repeated Filter filters = 12;
  // Raw MitoTip score.
  optional float mitotip_score = 13;
  // Interpretation of the MitoTip score.
  optional string mitotip_trna_prediction = 14;
  // PON-mt-tRNA classification of tRNA pathogenicity.
  optional string pon_mt_trna_prediction = 15;
  // PON-mt-tRNA ML_probability_of_pathogenicity for the tRNA; stored as a
  // string.
  optional string pon_ml_probability_of_pathogenicity = 16;
  // Annotation records from VEP v3.
  repeated annonars.gnomad.vep_gnomad3.Vep vep = 17;
  // Summary values for interpreting variant quality.
  optional QualityInfo quality_info = 18;
  // Heteroplasmy level information.
  optional HeteroplasmyInfo heteroplasmy_info = 19;
  // Histograms for the variant quality filters.
  optional FilterHistograms filter_histograms = 20;
  // Per-population information.
  optional PopulationInfo population_info = 21;
  // Per-haplogroup information.
  optional HaplogroupInfo haplogroup_info = 22;
  // Age information.
  optional AgeInfo age_info = 23;
  // Depth of coverage information.
  optional DepthInfo depth_info = 24;
}