import codecs
import contextlib
import io
import json
import os
import re
import sys
from collections import Counter
from os.path import dirname, join, pardir, relpath
from typing import Any, Dict, List, Optional, Set, TypeVar

from . import parser
from ._vendor.funcparserlib.parser import NoParseError

# Aliases for the built-in text and binary string types.
text_type = str
binary_type = bytes

# Type variable constrained to ``str`` or ``bytes``; used to annotate headers
# and items of .dat files, which are linted either decoded or as raw bytes.
StringLike = TypeVar("StringLike", str, bytes)

# Parent of the directory containing this module. The test directories are
# looked up under it and reported paths are made relative to it.
base = join(dirname(__file__), pardir)

# Matches a literal ``\uXXXX`` escape, optionally followed immediately by a
# second one, so that a surrogate pair can be decoded in a single step.
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")


def clean_path(path: str) -> str:
    """Return ``path`` expressed relative to the module-level ``base`` directory."""
    relative = relpath(path, start=base)
    return relative


def is_subsequence(l1: List[StringLike], l2: List[StringLike]) -> bool:
    """Check whether ``l1`` is a subsequence of ``l2``.

    ``l1`` is a subsequence when all of its elements appear in ``l2`` in the
    same relative order, though not necessarily next to each other.

    Args:
        l1: The candidate subsequence.
        l2: The sequence to search in.

    Returns:
        True if ``l1`` is a subsequence of ``l2``. An empty ``l1`` is a
        subsequence of every sequence, including an empty one.
    """
    # A membership test on an iterator consumes it up to and including the
    # first match, so each element of l1 is only searched for *after* the
    # position where the previous element was found.
    remaining = iter(l2)
    return all(x in remaining for x in l1)


def unescape_json(obj: Any) -> Any:
    """Return ``obj`` with literal \\uXXXX escapes decoded throughout.

    Dicts are rebuilt with decoded keys and recursively processed values,
    lists are processed element by element, and text strings are decoded.
    Any other object is returned unchanged.
    """

    def expand(match):
        # Turn one regex match (a single escape, or two adjacent escapes)
        # into the character(s) it stands for.
        lead = int(match.group(1), 16)
        trail_hex = match.group(2)
        if trail_hex is None:
            return chr(lead)

        trail = int(trail_hex, 16)
        forms_pair = (
            0xD800 <= lead <= 0xDBFF
            and 0xDC00 <= trail <= 0xDFFF
            and sys.maxunicode == 0x10FFFF
        )
        if forms_pair:
            # Adjacent high + low surrogate escapes: combine them into the
            # single non-BMP code point they encode.
            return chr(0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00))
        return chr(lead) + chr(trail)

    def decode_str(inp):
        """Decode \\uXXXX escapes in ``inp``, pairing adjacent surrogates."""
        # The unicode_escape codec is not usable here: it requires
        # ISO-8859-1 input, whereas the input may be arbitrary unicode.
        return _surrogateRe.sub(expand, inp)

    if isinstance(obj, dict):
        return {decode_str(key): unescape_json(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [unescape_json(element) for element in obj]
    if isinstance(obj, text_type):
        return decode_str(obj)
    return obj


def lint_dat_format(
    path: str,
    encoding: Optional[str],
    first_header: StringLike,
    expected_headers: Optional[List[StringLike]] = None,
    input_headers: Optional[Set[StringLike]] = None,
) -> Optional[List[Dict[StringLike, StringLike]]]:
    """Lint a single ``.dat`` test file, printing every problem found.

    The file is parsed with ``parser.parse`` and each parsed item is checked
    for headers repeated within the item, for headers that are not an
    in-order subsequence of ``expected_headers``, and for items whose input
    duplicates that of an earlier item in the same file.

    Args:
        path: Path of the file to lint.
        encoding: Encoding used to decode the file. When ``None`` the file is
            read as bytes and the headers are expected to be ``bytes``.
        first_header: Header that starts each item; passed to the parser.
        expected_headers: Allowed headers, in the order they must appear.
            ``None`` disables the header check.
        input_headers: Headers whose values together identify an item when
            looking for duplicates. Defaults to all of ``expected_headers``;
            when both are ``None`` the whole item is compared.

    Returns:
        One dict per parsed item mapping header to value, or ``None`` when
        the file could not be parsed (the parse error is printed).

    Raises:
        ValueError: If the header configuration is inconsistent.
    """
    if expected_headers is not None:
        if first_header not in expected_headers:
            raise ValueError(
                "First header must be an expected header. (lint config error)"
            )

        # Non-strict subset: using every expected header as input is valid,
        # and is exactly what the default below produces.
        if input_headers is not None and not (
            set(input_headers) <= set(expected_headers)
        ):
            raise ValueError(
                "Input header must be a subset of expected headers. (lint config error)"
            )

        if len(set(expected_headers)) < len(expected_headers):
            raise ValueError(
                "Can't expect a single header multiple times. (lint config error)"
            )

        if input_headers is None:
            input_headers = set(expected_headers)

    if encoding is not None:
        # codecs.open reads the underlying file in binary mode, so line
        # endings reach the parser without newline translation.
        with codecs.open(path, "r", encoding=encoding) as fp:
            dat = fp.read()
    else:
        with open(path, "rb") as fp:
            dat = fp.read()

    try:
        parsed = parser.parse(dat, first_header)
    except NoParseError as e:
        print("Parse error in {}, {}".format(path, e))
        return None

    # Maps the identifying (header, value) pairs of an item to the line on
    # which that item was first seen.
    seen_items = {}

    for item in parsed:
        # Check we don't have duplicate headers within one item.
        header_counts = Counter(x[0] for x in item.data)
        for header, count in header_counts.items():
            if count > 1:
                print(
                    f"Duplicate header {header!r} occurs {count} times in one item in {path} at line {item.lineno}"
                )

        item_dict = dict(item.data)

        # Check we only have expected headers.
        if expected_headers is not None and not is_subsequence(
            list(item_dict.keys()),
            expected_headers,
        ):
            unexpected = item_dict.keys()
            print(
                f"Unexpected item headings in {list(unexpected)!r} in {path} at line {item.lineno}"
            )

        # Check for duplicated items.
        if input_headers is not None:
            # Missing input headers are recorded as None so that their
            # absence is part of the item's identity.
            found_input = {
                (input_header, item_dict.get(input_header))
                for input_header in input_headers
            }
        else:
            found_input = set(item_dict.items())

        first_line = seen_items.setdefault(frozenset(found_input), item.lineno)
        if first_line is not None and first_line != item.lineno:
            print(
                f"Duplicate item in {path} at line {item.lineno} previously seen on line {first_line}"
            )

    return [dict(x.data) for x in parsed]


def lint_encoding_test(path: str) -> None:
    """Lint one encoding test ``.dat`` file, read as raw bytes.

    Items start at ``data`` and may also carry ``encoding``; two items are
    treated as duplicates when their ``data`` values match.
    """
    allowed_headers = [b"data", b"encoding"]
    identifying_headers = {b"data"}
    items = lint_dat_format(
        path,
        None,
        b"data",
        expected_headers=allowed_headers,
        input_headers=identifying_headers,
    )
    if not items:
        # We'll already have output if there's a parse error.
        return

    # We'd put extra linting here, if we ever have anything specific to the
    # encoding tests here.


def lint_encoding_tests(path: str) -> None:
    """Lint every ``.dat`` file found under ``path``, recursively.

    Within each directory the files are visited in sorted name order.
    """
    for directory, _subdirs, filenames in os.walk(path):
        dat_names = [name for name in sorted(filenames) if name.endswith(".dat")]
        for name in dat_names:
            lint_encoding_test(clean_path(join(directory, name)))


def lint_tokenizer_test(path: str) -> None:
    """Lint one tokenizer test file (JSON), printing every problem found.

    The top level must be an object whose values are lists of test objects.
    Every test must contain the ``input`` and ``output`` properties and may
    only use the known property names. Tests flagged ``doubleEscaped`` are
    unescaped before their properties are checked.

    Args:
        path: Path of the JSON ``.test`` file to lint.
    """
    all_keys = {
        "description",
        "input",
        "output",
        "initialStates",
        "lastStartTag",
        "ignoreErrorOrder",
        "doubleEscaped",
        "errors",
    }
    required = {"input", "output"}
    with codecs.open(path, "r", "utf-8") as fp:
        parsed = json.load(fp)
    if not parsed:
        return
    if not isinstance(parsed, dict):
        print("Top-level must be an object in %s" % path)
        return
    for test_group in parsed.values():
        if not isinstance(test_group, list):
            print("Test groups must be a lists in %s" % path)
            continue
        for test in test_group:
            # Report a malformed test instead of crashing on it: only a JSON
            # object has the properties inspected below.
            if not isinstance(test, dict):
                print("Tests must be objects in %s" % path)
                continue
            if test.get("doubleEscaped") is True:
                test = unescape_json(test)
            keys = set(test.keys())
            if not (required <= keys):
                print(
                    "missing test properties {!r} in {}".format(required - keys, path)
                )
            if not (keys <= all_keys):
                print(
                    "unknown test properties {!r} in {}".format(keys - all_keys, path)
                )


def lint_tokenizer_tests(path: str) -> None:
    """Lint every ``.test`` file found under ``path``, recursively.

    Within each directory the files are visited in sorted name order.
    """
    for directory, _subdirs, filenames in os.walk(path):
        test_names = [name for name in sorted(filenames) if name.endswith(".test")]
        for name in test_names:
            lint_tokenizer_test(clean_path(join(directory, name)))


def lint_tree_construction_test(path: str) -> None:
    """Lint one tree-construction test ``.dat`` file, decoded as UTF-8.

    Items start at ``data``; the headers must appear in the order listed
    below, and the headers in ``identifying_headers`` decide whether two
    items count as duplicates.
    """
    header_order = [
        "data",
        "errors",
        "new-errors",
        "document-fragment",
        "script-off",
        "script-on",
        "document",
    ]
    identifying_headers = {
        "data",
        "document-fragment",
        "script-on",
        "script-off",
    }
    items = lint_dat_format(
        path,
        "utf-8",
        "data",
        expected_headers=header_order,
        input_headers=identifying_headers,
    )
    if not items:
        # We'll already have output if there's a parse error.
        return

    # We'd put extra linting here, if we ever have anything specific to the
    # tree construction tests here.


def lint_tree_construction_tests(path: str) -> None:
    """Lint every ``.dat`` file found under ``path``, recursively.

    Within each directory the files are visited in sorted name order.
    """
    for directory, _subdirs, filenames in os.walk(path):
        dat_names = [name for name in sorted(filenames) if name.endswith(".dat")]
        for name in dat_names:
            lint_tree_construction_test(clean_path(join(directory, name)))


def main() -> int:
    """Run all linters and return the process exit status.

    Everything the linters print is captured, then echoed once they have all
    finished. The status is 0 when nothing was printed and 1 otherwise.
    """
    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        lint_encoding_tests(join(base, "encoding"))
        lint_tokenizer_tests(join(base, "tokenizer"))
        lint_tree_construction_tests(join(base, "tree-construction"))

    report = captured.getvalue()
    print(report, end="")
    # Any output at all means at least one problem was reported.
    return 1 if report else 0


# Script entry point: run every linter and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())