#!/usr/bin/env python
"""Helper script to convert ClinCNV files to VCF format for import."""

import csv
import enum
from typing import Annotated
from pathlib import Path
import logging

from bioutils import assemblies
import pydantic
import logzero
from logzero import logger
import typer
import vcfpy


@enum.unique
class GenomeRelease(enum.Enum):
    """Enumeration for genome releases"""

    #: GRCh37 release
    GRCH37 = "GRCh37"
    #: GRCh38 release
    GRCH38 = "GRCh38"


#: Canonical contig names.
CANONICAL_CONTIGS = [
    *[str(i) for i in range(1, 23)],
    "X", "Y", "MT"
]

def get_clincnv_version(path: Path) -> str:
    """Get ClinCNV version from file."""
    with path.open("rt") as inputf:
        for line in inputf:
            if line.startswith("##ClinCNV version:"):
                return line.strip().split(": ")[1].strip()
            elif not line.startswith("#"):
                break
    raise RuntimeError("Could not determine ClinCNV version")


def create_header(
    sample_name: str, genome_release: GenomeRelease, clincnv_version: str
) -> vcfpy.Header:
    """Create header with the given values."""
    header = vcfpy.Header(samples=vcfpy.SamplesInfos([sample_name]))
    header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))
    header.add_line(vcfpy.HeaderLine(key="source", value=f"ClinCNV {clincnv_version}"))
    # FILTER
    header.add_filter_line({"ID": "PASS", "Description": "All filters passed"})
    header.add_filter_line(
        {
            "ID": "LowQual",
            "Description": "Loglikelyhood (reported as GQ) is less than 20q",
        }
    )
    # INFO
    header.add_info_line(
        {
            "ID": "SVLEN",
            "Number": ".",
            "Type": "Integer",
            "Description": "Difference in length between REF and ALT alleles",
        }
    )
    header.add_info_line(
        {
            "ID": "END",
            "Number": 1,
            "Type": "Integer",
            "Description": "End position of the variant described in this record",
        }
    )
    header.add_info_line(
        {
            "ID": "SVTYPE",
            "Number": 1,
            "Type": "String",
            "Description": "Type of structural variant",
        }
    )
    # FORMAT
    header.add_format_line(
        {
            "ID": "CN",
            "Number": 1,
            "Type": "Integer",
            "Description": "Segment most-likely copy-number call",
        }
    )
    header.add_format_line(
        {
            "ID": "GT",
            "Number": 1,
            "Type": "String",
            "Description": "Segment genotype 0 or 1",
        }
    )
    header.add_format_line(
        {"ID": "NP", "Number": 1, "Type": "Integer", "Description": "Number of regions"}
    )
    header.add_format_line(
        {
            "ID": "GQ",
            "Number": 1,
            "Type": "Integer",
            "Description": "Loglikelyhood of call",
        }
    )
    # CONTIG
    if genome_release == GenomeRelease.GRCH37:
        # we need p13 as we want chrMT
        assembly_info = assemblies.get_assembly("GRCh37.p12")
    else:
        assembly_info = assemblies.get_assembly("GRCh38")
    for seq in assembly_info["sequences"]:
        if seq["name"].replace("chr", "") in CANONICAL_CONTIGS:
            header.add_contig_line(
                {
                    "ID": seq["name"] if genome_release == GenomeRelease.GRCH37 else f"chr{seq['name']}",
                    "length": seq["length"],
                    "assembly": genome_release.value,
                }
            )

    return header


class ClinCnvRecord(pydantic.BaseModel):
    chr: str
    start: int
    end: int
    cn_change: int = pydantic.Field(validation_alias="CN_change")
    loglikelihood: int
    no_of_regions: int
    length_kb: str = pydantic.Field(validation_alias="length_KB")
    potential_af: str = pydantic.Field(validation_alias="potential_AF")
    genes: str
    qvalue: str
    overlap_af_genomes_imgag: str = pydantic.Field(validation_alias="overlap af_genomes_imgag")
    cn_pathogenic: str
    dosage_sensitive_disease_genes: str
    clinvar_cnvs: str
    omim: str
    gene_info: str
    ngsd_pathogenic_cnvs: str


def run_processing(
    path_in: Path,
    path_out: Path,
    header: vcfpy.Header,
):
    """Run the actual processing"""
    logger.info("    - skipping header")
    with path_in.open("rt") as inputf:
        # skip over ## header
        while True:
            pos = inputf.tell()
            line = inputf.readline()
            if line is None:
                break
            if not line.startswith("##"):
                break
        inputf.seek(pos)
        inputf.read(1)  # skip comment from #chr... header

        reader = csv.DictReader(inputf, delimiter="\t")
        with vcfpy.Writer.from_path(path_out, header) as writer:
            for raw_record in reader:
                clincnv_record = ClinCnvRecord(**raw_record)
                record = vcfpy.Record(
                    CHROM=clincnv_record.chr,
                    POS=clincnv_record.start,
                    ID=[],
                    QUAL=None,
                    REF="N",
                    ALT=[vcfpy.SymbolicAllele("DEL" if clincnv_record.cn_change < 2 else "DUP")],
                    FILTER=[] if clincnv_record.loglikelihood >= 20 else ["LowQual"],
                    INFO={
                        "END": clincnv_record.end,
                        "SVTYPE": "DEL" if clincnv_record.cn_change <2 else "DUP",
                        "SVLEN": [clincnv_record.end - clincnv_record.start + 1],
                    },
                    FORMAT=["GT", "CN", "GQ", "NP"],
                    calls=[vcfpy.Call(sample=header.samples.names[0], data={
                        "GT": "1",
                        "CN": clincnv_record.cn_change,
                        "GQ": clincnv_record.loglikelihood,
                        "NP": clincnv_record.no_of_regions,
                    })]
                )

                writer.write_record(record)


def main(
    path_in: Annotated[Path, typer.Option(help="Path to input TSV file")],
    path_out: Annotated[Path, typer.Option(help="Path to output VCF")],
    sample_name: Annotated[str, typer.Option(help="Sample name to use in VCF header")],
    genome_release: Annotated[
        GenomeRelease,
        typer.Option(
            help="Genome release to use for VCF header",
            show_default=True,
        ),
    ],
    verbose: Annotated[
        bool, typer.Option("--verbose", "-v", help="Enable verbose output")
    ] = False,
):
    """Convert ClinCNV TSV file to VCF format.

    Notes:

    Loglikelyhood is written out as in GQ tag.
    """
    # Setup logging
    if verbose:  # pragma: no cover
        level = logging.DEBUG
    else:
        # Remove module name and line number if not running in debug mode.s
        formatter = logzero.LogFormatter(
            fmt="%(color)s[%(levelname)1.1s %(asctime)s]%(end_color)s %(message)s"
        )
        logzero.formatter(formatter)
        level = logging.INFO
    logzero.loglevel(level=level)

    logger.info("Starting conversion...")
    logger.info("  - getting ClinCNV version")
    clincnv_version = get_clincnv_version(path_in)
    logger.info("  - creating header")
    header = create_header(sample_name, genome_release, clincnv_version)
    logger.info("  - processing records")
    run_processing(path_in, path_out, header)
    logger.info("... done with conversion")

    logger.info("All done. Have a nice day!")


if __name__ == "__main__":
    typer.run(main)