#!/usr/bin/env python """Helper script to convert the HelixMtDb format to VCF. The resulting file will look like this:: ##fileformat=VCFv4.2 ##contig= ##FILTER= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO chrM 5 . A C . PASS AN=196554;AC_hom=1;AC_het=0 chrM 10 . T C . PASS AN=196554;AC_hom=7;AC_het=1 chrM 11 . C T . PASS AN=196554;AC_hom=0;AC_het=1 """ import csv import json import sys import vcfpy #: Number of individiduals in HelixMtDb AN = 196_554 def build_writer(): """Build ``vcfpy`` writer for writing out the VCF file.""" header = vcfpy.Header(samples=vcfpy.SamplesInfos(sample_names=[])) header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.2")) header.add_contig_line({"ID": "chrM", "length": 16569}) header.add_filter_line({"ID": "PASS", "Description": "Variant passes all filters"}) header.add_info_line( { "ID": "AN", "Number": 1, "Type": "Integer", "Description": "Overall allele number (Number of alleles with non-missing genotype)", } ) header.add_info_line( { "ID": "AC_hom", "Number": 1, "Type": "Integer", "Description": "Allele counts called as homoplasmic", } ) header.add_info_line( { "ID": "AC_het", "Number": 1, "Type": "Integer", "Description": "Alelle counts called as heteroplasmic", } ) return vcfpy.Writer(sys.stdout, header) def main(): writer = build_writer() reader = csv.DictReader(sys.stdin, delimiter="\t") for row in reader: locus = row["locus"].split(":") chrom, pos = locus[0], int(locus[1]) alleles = json.loads(row["alleles"]) ac_hom = int(row["counts_hom"]) ac_het = int(row["counts_het"]) writer.write_record( vcfpy.Record( CHROM=chrom, POS=pos, ID=[], REF=alleles[0], ALT=[vcfpy.Substitution(vcfpy.SNV, alleles[1])], QUAL=None, FILTER=["PASS"], INFO={ "AN": AN, "AC_hom": ac_hom, "AC_het": ac_het, }, FORMAT=None, calls=None, ) ) if __name__ == "__main__": sys.exit(main())