// Protobufs for gnomAD v3.
//
// In the case of smaller additions in **both exomes and genomes** in v4.0,
// we extended the protobufs here and re-used them.

syntax = "proto3";

package annonars.gnomad.gnomad3;

import "annonars/gnomad/vep_gnomad3.proto";

// Protocol buffer enum for site-level filters.
enum Filter {
  // Unknown filter; this is the zero/default value.
  FILTER_UNKNOWN = 0;
  // Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and
  // AB < 0.2 for het calls).
  FILTER_ALLELE_COUNT_IS_ZERO = 1;
  // Failed VQSR filtering thresholds of:
  //
  // gnomAD-genomes v3: -2.7739 for SNPs and -1.0606 for indels
  // gnomAD-genomes v4: -2.502 for SNPs and -0.7156 for indels
  // gnomAD-exomes v4: -1.4526 for SNPs and 0.0717 for indels
  //
  // NB: the identifier spells "VSQR" (sic, transposed "VQSR"); it is kept as-is because
  // renaming an enum value changes the generated code and the JSON/text representation.
  FILTER_AS_VSQR = 2;
  // InbreedingCoeff < -0.3.
  FILTER_INBREEDING_COEFF = 3;
  // Passed all variant filters.
  FILTER_PASS = 4;
}

// Store details on variant effect predictions.
message EffectInfo {
  // PrimateAI's deleteriousness score from 0 (less deleterious) to 1 (more deleterious).
  optional float primate_ai_score = 1;
  // dbNSFP's Revel score from 0 to 1. Variants with higher scores are predicted to be
  // more likely to be deleterious.
  optional float revel_score = 2;
  // Illumina's SpliceAI max delta score; interpreted as the probability of the variant
  // being splice-altering.
  optional float splice_ai_max_ds = 3;
  // The consequence term associated with the max delta score in 'splice_ai_max_ds'.
  optional string splice_ai_consequence = 4;
  // Raw CADD scores are interpretable as the extent to which the annotation profile for a
  // given variant suggests that the variant is likely to be 'observed' (negative values) vs
  // 'simulated' (positive values). Larger values are more deleterious.
  optional float cadd_raw = 5;
  // CADD Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on the rank of
  // each variant relative to all possible 8.6 billion substitutions in the human reference
  // genome. Larger values are more deleterious.
  optional float cadd_phred = 6;
}

// Store the relevant allele counts and frequencies in a given sub cohort.
message AlleleCounts {
  // Number of alternate alleles in sub cohort.
  int32 ac = 1;
  // Total number of alleles in the sub cohort.
  int32 an = 2;
  // Number of homozygous alternate alleles in the sub cohort.
  int32 nhomalt = 3;
  // Alternate allele frequency in the sub cohort.
  float af = 4;
}

// Store the allele counts for the given sub cohort and sub cohort factored by sex.
message AlleleCountsBySex {
  // Overall allele counts in the sub cohort.
  AlleleCounts overall = 1;
  // Allele counts in female/XX karyotype individuals of sub cohort.
  optional AlleleCounts xx = 2;
  // Allele counts in male/XY karyotype individuals of sub cohort.
  optional AlleleCounts xy = 3;
}

// Store the allele counts for the given sub cohort in the given population.
message PopulationAlleleCounts {
  // Name of the population.
  string population = 1;
  // The overall allele counts and those by sex.
  AlleleCountsBySex counts = 2;
  // The filtering allele frequency (using Poisson 95% CI).
  optional float faf95 = 3;
  // The filtering allele frequency (using Poisson 99% CI).
  optional float faf99 = 4;
  // The filtering allele frequency for XX samples (using Poisson 95% CI).
  optional float faf95_xx = 5;
  // The filtering allele frequency for XX samples (using Poisson 99% CI).
  optional float faf99_xx = 6;
  // The filtering allele frequency for XY samples (using Poisson 95% CI).
  optional float faf95_xy = 7;
  // The filtering allele frequency for XY samples (using Poisson 99% CI).
  optional float faf99_xy = 8;
}

// Store the allele counts for the given cohort.
message CohortAlleleCounts {
  // Name of the cohort.
  optional string cohort = 1;
  // Allele counts for each population.
  repeated PopulationAlleleCounts by_population = 2;
  // Allele counts by sex.
  AlleleCountsBySex by_sex = 3;
  // Raw allele counts.
  AlleleCounts raw = 4;
  // The population with maximum AF.
  optional string popmax = 5;
  // Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and
  // indeterminate ancestry).
  optional float af_popmax = 6;
  // Allele count in population with maximum AF.
  optional int32 ac_popmax = 7;
  // Total number of alleles in population with maximum AF.
  optional int32 an_popmax = 8;
  // Total number of homozygous individuals in population with maximum AF.
  optional int32 nhomalt_popmax = 9;
}

// Encapsulate VCF INFO fields related to age.
//
// NOTE(review): the field comments below speak of "homoplasmic"/"heteroplasmic" variants,
// which is mitochondrial terminology. As this message is used by the nuclear `Record`, the
// `hom`/`het` fields presumably refer to homozygous/heterozygous individuals -- confirm
// against the gnomAD VCF header.
message AgeInfo {
  // Histogram of ages of individuals with a homoplasmic variant; bin edges are: [30.0, 35.0,
  // 40.0, 45.0, 50.0, 55.0, 60.0, 65.0, 70.0, 75.0, 80.0].
  repeated int32 age_hist_hom_bin_freq = 1;
  // Count of age values falling below lowest histogram bin edge for individuals with a
  // homoplasmic variant.
  optional int32 age_hist_hom_n_smaller = 2;
  // Count of age values falling above highest histogram bin edge for individuals with a
  // homoplasmic variant.
  optional int32 age_hist_hom_n_larger = 3;
  // Histogram of ages of individuals with a heteroplasmic variant; bin edges are: [30.0, 35.0,
  // 40.0, 45.0, 50.0, 55.0, 60.0, 65.0, 70.0, 75.0, 80.0].
  repeated int32 age_hist_het_bin_freq = 4;
  // Count of age values falling below lowest histogram bin edge for individuals with a
  // heteroplasmic variant.
  optional int32 age_hist_het_n_smaller = 5;
  // Count of age values falling above highest histogram bin edge for individuals with a
  // heteroplasmic variant.
  optional int32 age_hist_het_n_larger = 6;
}

// Encapsulate VCF INFO fields related to depth.
//
// NOTE(review): confirm the bin edges quoted below against the gnomAD VCF header for nuclear
// records; they are reproduced here unchanged.
message DepthInfo {
  // Count of dp values falling above highest histogram bin edge for all individuals.
  optional int32 dp_hist_all_n_larger = 1;
  // Count of dp values falling above highest histogram bin edge for individuals with the
  // alternative allele.
  optional int32 dp_hist_alt_n_larger = 2;
  // Histogram of dp values for all individuals; bin edges are: [0.0, 200.0, 400.0, 600.0,
  // 800.0, 1000.0, 1200.0, 1400.0, 1600.0, 1800.0, 2000.0].
  repeated int32 dp_hist_all_bin_freq = 3;
  // Histogram of dp values for individuals with the alternative allele; bin edges are: [0.0,
  // 200.0, 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0, 1600.0, 1800.0, 2000.0].
  repeated int32 dp_hist_alt_bin_freq = 4;
}

// Encapsulate quality-related information.
message QualityInfo {
  // Allele-specific phred-scaled p-value of Fisher's exact test for strand bias.
  optional float as_fs = 1;
  // Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared
  // against the Hardy-Weinberg expectation.
  optional float inbreeding_coeff = 2;
  // Allele-specific root mean square of the mapping quality of reads across all samples.
  optional float as_mq = 3;
  // Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities.
  optional float mq_rank_sum = 4;
  // Allele-specific z-score from Wilcoxon rank sum test of alternate vs. reference read
  // mapping qualities.
  optional float as_mq_rank_sum = 5;
  // Allele-specific variant call confidence normalized by depth of sample reads supporting a
  // variant.
  optional float as_qd = 6;
  // Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias.
  optional float read_pos_rank_sum = 7;
  // Allele-specific z-score from Wilcoxon rank sum test of alternate vs. reference read
  // position bias.
  optional float as_read_pos_rank_sum = 8;
  // Allele-specific strand bias estimated by the symmetric odds ratio test.
  optional float as_sor = 9;
  // Variant was used to build the positive training set of high-quality variants for VQSR.
  bool positive_train_site = 10;
  // Variant was used to build the negative training set of low-quality variants for VQSR.
  bool negative_train_site = 11;
  // Allele-specific log-odds ratio of being a true variant versus being a false positive
  // under the trained VQSR Gaussian mixture model.
  optional float as_vqslod = 12;
  // Allele-specific worst-performing annotation in the VQSR Gaussian mixture model.
  optional string as_culprit = 13;
  // Variant falls within a segmental duplication region.
  bool segdup = 14;
  // Variant falls within a low complexity region.
  bool lcr = 15;
  // Variant was a callset-wide doubleton that was transmitted within a family (i.e., a
  // singleton amongst unrelated samples in cohort).
  bool transmitted_singleton = 16;
  // Maximum p-value over callset for binomial test of observed allele balance for a
  // heterozygous genotype, given expectation of 0.5.
  optional float as_pab_max = 17;
  // Allele-specific sum of PL[0] values; used to approximate the QUAL score.
  optional int32 as_qual_approx = 18;
  // Allele-specific forward/reverse read counts for strand bias tests.
  optional string as_sb_table = 19;
  // Strand bias estimated by the symmetric odds ratio test (v4 only).
  optional float sor = 20;
}

// Variant type related information.
message VariantInfo {
  // Variant type (snv, indel, multi-snv, multi-indel, or mixed).
  string variant_type = 1;
  // Allele type (snv, ins, del, or mixed).
  string allele_type = 2;
  // Total number of alternate alleles observed at variant locus.
  int32 n_alt_alleles = 3;
  // Variant type was mixed.
  bool was_mixed = 4;
  // All samples are homozygous alternate for the variant.
  bool monoallelic = 5;
  // Depth over variant genotypes (does not include depth of reference samples).
  int32 var_dp = 6;
  // Allele-specific depth over variant genotypes (does not include depth of reference
  // samples) (v4 only).
  optional int32 as_vardp = 7;
}

// Protocol buffer for the gnomAD-nuclear VCF record.
//
// The more specialized fields from the INFO column are stored in separate, optional fields such
// that we don't end up with a humongous message.
message Record {
  // Chromosome name.
  string chrom = 1;
  // 1-based start position.
  int32 pos = 2;
  // Reference allele.
  string ref_allele = 3;
  // Alternate allele.
  string alt_allele = 4;
  // Site-level filters.
  repeated Filter filters = 5;
  // VEP annotation records.
  repeated annonars.gnomad.vep_gnomad3.Vep vep = 6;
  // Variant allele counts in the different cohorts and populations.
  //
  // The cohorts in gnomAD v2/3 are: empty for global, "controls", "non_cancer", "non_neuro",
  // and "non_topmed".
  repeated CohortAlleleCounts allele_counts = 7;
  // Variant (on sex chromosome) falls outside a pseudoautosomal region.
  bool nonpar = 8;
  // Information on variant scores.
  optional EffectInfo effect_info = 9;
  // Variant-related information details.
  optional VariantInfo variant_info = 10;
  // Summary information for variant quality interpretation.
  optional QualityInfo quality_info = 11;
  // Age-related information.
  optional AgeInfo age_info = 12;
  // Depth of coverage-related information.
  optional DepthInfo depth_info = 13;
}