syntax = "proto3";

package annonars.gnomad.gnomad2;

import "annonars/gnomad/vep_gnomad2.proto";

// Site-level filter values that can be attached to a record.
enum Filter {
  // Filter is not known (zero/default value).
  FILTER_UNKNOWN = 0;

  // The allele count is zero once low-confidence genotypes have been removed
  // (GQ < 20; DP < 10; and AB < 0.2 for het calls).
  FILTER_ALLELE_COUNT_IS_ZERO = 1;

  // InbreedingCoeff < -0.3.
  FILTER_INBREEDING_COEFF = 2;

  // The variant passed all variant filters.
  FILTER_PASS = 3;

  // The variant failed the random forest filtering thresholds of
  // 0.055272738028512555, 0.20641025579497013 (probabilities of being a true
  // positive variant) for SNPs, indels.
  FILTER_RANDOM_FOREST = 4;
}

// Allele counts and allele frequency within one sub cohort.
message AlleleCounts {
  // Number of alternate alleles in the sub cohort.
  int32 ac = 1;

  // Total number of alleles in the sub cohort.
  int32 an = 2;

  // Number of homozygous alternate alleles in the sub cohort.
  int32 nhomalt = 3;

  // Alternate allele frequency in the sub cohort.
  float af = 4;
}

// Allele counts of a sub cohort, both overall and split up by sex.
message AlleleCountsBySex {
  // Allele counts over the whole sub cohort.
  AlleleCounts overall = 1;

  // Allele counts in the female/XX karyotype individuals of the sub cohort.
  optional AlleleCounts xx = 2;

  // Allele counts in the male/XY karyotype individuals of the sub cohort.
  optional AlleleCounts xy = 3;
}

// Allele counts for one population.
message PopulationAlleleCounts {
  // Name of the population.
  string population = 1;

  // Overall allele counts together with the per-sex allele counts.
  AlleleCountsBySex counts = 2;

  // Filtering allele frequency (using Poisson 95% CI).
  optional float faf95 = 3;

  // Filtering allele frequency (using Poisson 99% CI).
  optional float faf99 = 4;
}

// Allele counts for one cohort.
message CohortAlleleCounts {
  // Name of the cohort.
  optional string cohort = 1;

  // Allele counts, one entry per population.
  repeated PopulationAlleleCounts by_population = 2;

  // Allele counts by sex.
  AlleleCountsBySex by_sex = 3;

  // Raw allele counts.
  AlleleCounts raw = 4;

  // Population that has the maximum AF.
  optional string popmax = 5;

  // Maximum allele frequency across populations (excluding samples of
  // Ashkenazi, Finnish, and indeterminate ancestry).
  optional float af_popmax = 6;

  // Allele count in the population with maximum AF.
  optional int32 ac_popmax = 7;

  // Total number of alleles in the population with maximum AF.
  optional int32 an_popmax = 8;

  // Total number of homozygous individuals in the population with maximum AF.
  optional int32 nhomalt_popmax = 9;
}

// VCF INFO fields that relate to age.
//
// NOTE(review): the field comments below speak of "homoplasmic" and
// "heteroplasmic" variants, which are mitochondrial terms; for gnomAD v2 this
// presumably means homozygous / heterozygous individuals -- confirm against
// the VCF header before relying on the wording.
message AgeInfo {
  // Age histogram for individuals with a homoplasmic variant; bin edges are:
  // [30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0, 70.0, 75.0, 80.0].
  repeated int32 age_hist_hom_bin_freq = 1;

  // Number of age values below the lowest histogram bin edge for individuals
  // with a homoplasmic variant.
  optional int32 age_hist_hom_n_smaller = 2;

  // Number of age values above the highest histogram bin edge for individuals
  // with a homoplasmic variant.
  optional int32 age_hist_hom_n_larger = 3;

  // Age histogram for individuals with a heteroplasmic variant; bin edges
  // are: [30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0, 70.0, 75.0, 80.0].
  repeated int32 age_hist_het_bin_freq = 4;

  // Number of age values below the lowest histogram bin edge for individuals
  // with a heteroplasmic variant.
  optional int32 age_hist_het_n_smaller = 5;

  // Number of age values above the highest histogram bin edge for individuals
  // with a heteroplasmic variant.
  optional int32 age_hist_het_n_larger = 6;
}

// VCF INFO fields that relate to depth.
message DepthInfo {
  // Number of dp values above the highest histogram bin edge, over all
  // individuals.
  optional int32 dp_hist_all_n_larger = 1;

  // Number of dp values above the highest histogram bin edge, over the
  // individuals with the alternative allele.
  optional int32 dp_hist_alt_n_larger = 2;

  // Histogram of dp values over all individuals; bin edges are: [0.0, 200.0,
  // 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0, 1600.0, 1800.0, 2000.0].
  repeated int32 dp_hist_all_bin_freq = 3;

  // Histogram of dp values over the individuals with the alternative allele;
  // bin edges are: [0.0, 200.0, 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0,
  // 1600.0, 1800.0, 2000.0].
  repeated int32 dp_hist_alt_bin_freq = 4;
}

// Quality-related information.
message QualityInfo {
  // Phred-scaled p-value of Fisher's exact test for strand bias.
  optional float fs = 1;

  // Inbreeding coefficient as estimated from the genotype likelihoods
  // per-sample when compared against the Hardy-Weinberg expectation.
  optional float inbreeding_coeff = 2;

  // Root mean square of the mapping quality of reads across all samples.
  optional float mq = 3;

  // Z-score from Wilcoxon rank sum test of alternate vs. reference read
  // mapping qualities.
  optional float mq_rank_sum = 4;

  // Variant call confidence normalized by depth of sample reads supporting a
  // variant.
  optional float qd = 5;

  // Z-score from Wilcoxon rank sum test of alternate vs. reference read
  // position bias.
  optional float read_pos_rank_sum = 6;

  // Variant was used to build the positive training set of high-quality
  // variants for VQSR.
  bool vqsr_positive_train_site = 7;

  // Variant was used to build the negative training set of low-quality
  // variants for VQSR.
  bool vqsr_negative_train_site = 8;

  // Z-score from Wilcoxon rank sum test of alternate vs. reference base
  // qualities.
  optional float base_q_rank_sum = 9;

  // Z-score from Wilcoxon rank sum test of alternate vs. reference number of
  // hard clipped bases.
  optional float clipping_rank_sum = 10;

  // Strand bias estimated by the symmetric odds ratio test.
  optional float sor = 11;

  // Depth of informative coverage for each sample; reads with MQ=255 or with
  // bad mates are filtered.
  optional int32 dp = 12;

  // Log-odds ratio of being a true variant versus being a false positive
  // under the trained VQSR Gaussian mixture model.
  optional float vqslod = 13;

  // Allele-specific worst-performing annotation in the VQSR Gaussian mixture
  // model.
  optional string vqsr_culprit = 14;

  // Variant falls within a segmental duplication region.
  bool segdup = 15;

  // Variant falls within a low complexity region.
  bool lcr = 16;

  // Variant falls within a reference decoy region.
  bool decoy = 17;

  // Variant was a callset-wide doubleton that was transmitted within a family
  // (i.e., a singleton amongst unrelated samples in cohort).
  bool transmitted_singleton = 18;

  // Maximum p-value over callset for binomial test of observed allele balance
  // for a heterozygous genotype, given expectation of AB=0.5.
  optional float pab_max = 19;
}

// Information related to the random forest model.
message RandomForestInfo {
  // Random forest prediction probability for a site being a true variant.
  float rf_tp_probability = 1;

  // Variant was labelled as a positive example for training of the random
  // forest model.
  bool rf_positive_label = 2;

  // Variant was labelled as a negative example for training of the random
  // forest model.
  bool rf_negative_label = 3;

  // Random forest training label.
  optional string rf_label = 4;

  // Variant was used in training the random forest model.
  bool rf_train = 5;
}

// Information related to liftover.
message LiftoverInfo {
  // The REF and the ALT alleles have been reverse complemented in liftover
  // since the mapping from the previous reference to the current one was on
  // the negative strand.
  bool reverse_complemented_alleles = 1;

  // The REF and the ALT alleles have been swapped in liftover due to changes
  // in the reference. It is possible that not all INFO annotations reflect
  // this swap, and in the genotypes, only the GT, PL, and AD fields have been
  // modified. You should check the TAGS_TO_REVERSE parameter that was used
  // during the LiftOver to be sure.
  bool swapped_alleles = 2;

  // The original alleles (including REF) of the variant prior to liftover.
  // If the alleles were not changed during liftover, this attribute will be
  // omitted.
  repeated string original_alleles = 3;

  // Name of the source contig/chromosome prior to liftover.
  optional string original_contig = 4;

  // Position of the variant on the source contig prior to liftover.
  //
  // NOTE(review): this position is declared as `string`, not an integer
  // type; switching the type would be a wire-incompatible change, so it is
  // kept as-is.
  optional string original_start = 5;
}

// Information related to the variant type.
message VariantInfo {
  // Variant type (snv, indel, multi-snv, multi-indel, or mixed).
  string variant_type = 1;

  // Allele type (snv, ins, del, or mixed).
  string allele_type = 2;

  // Total number of alternate alleles observed at the variant locus.
  int32 n_alt_alleles = 3;

  // Variant type was mixed.
  bool was_mixed = 4;

  // Variant locus coincides with a spanning deletion (represented by a star)
  // observed elsewhere in the callset.
  bool has_star = 5;
}

// Protocol buffer for the gnomAD v2 VCF record.
//
// The more specialized fields from the INFO column live in separate, optional
// sub-messages so that this message does not become humongous.
message Record {
  // Chromosome name.
  string chrom = 1;

  // 1-based start position.
  int32 pos = 2;

  // Reference allele.
  string ref_allele = 3;

  // Alternate allele.
  string alt_allele = 4;

  // Site-level filters.
  repeated Filter filters = 5;

  // VEP annotation records.
  repeated annonars.gnomad.vep_gnomad2.Vep vep = 6;

  // Variant allele counts in the different cohorts and population.
  //
  // The populations in gnomAD v2/3 are: empty for global, "controls",
  // "non_cancer", "non_neuro", and "non_topmed".
  //
  // NOTE(review): the values listed above look like cohort names (compare
  // `CohortAlleleCounts.cohort`) rather than population names -- confirm.
  repeated CohortAlleleCounts allele_counts = 7;

  // Variant (on sex chromosome) falls outside a pseudoautosomal region.
  bool nonpar = 8;

  // Information on lift-over from GRCh37 to GRCh38.
  optional LiftoverInfo liftover_info = 9;

  // Random forest related information.
  optional RandomForestInfo rf_info = 10;

  // Variant-related information details.
  optional VariantInfo variant_info = 11;

  // Summary information for variant quality interpretation.
  optional QualityInfo quality_info = 12;

  // Age-related information.
  optional AgeInfo age_info = 13;

  // Depth of coverage-related information.
  optional DepthInfo depth_info = 14;
}