'''Reads the braille character Excel spreadsheet ('BrailleMathCodes Repository.xlsx') and
   generates unicode.yaml files as best it can given a language param'''
from typing import Dict
import csv
import re
from functools import reduce
import sys
from entities import ENTITY_NAME_TO_CHAR
import openpyxl
import pandas
# Force UTF-8 on stdout (presumably so the braille/Unicode characters in the warnings print
# on consoles whose default encoding cannot represent them -- TODO confirm).
sys.stdout.reconfigure(encoding='utf-8')  # type: ignore

# Number of rows for which get_unicode() found no MathML example. It is reset by
# get_braille_values() and reported in that function's summary line.
NO_MATHML_COUNT = 0


def get_braille_values(lang: str):
    '''Builds '{lang}-unicode.yaml' from the braille spreadsheet, reporting problems on stdout.

       `lang` is both the name of the spreadsheet column holding the braille for that code
       and the prefix of the generated file name.

       For every row that has braille in the `lang` column, the Unicode character comes from
       (in order): the 'Character' column, the hard-coded symbol-name table, or the <mo>
       content of the row's MathML examples. Rows rejected by should_ignore_line() are
       skipped unless they carry an explicit 'Character' value.'''
    global NO_MATHML_COUNT
    with open(f"{lang}-unicode.yaml", mode='w', encoding='utf8') as out_stream, \
            pandas.ExcelFile('BrailleMathCodes Repository.xlsx') as xls:
        sheet = pandas.read_excel(xls, sheet_name='Enhanced Unicode Other codes')
        sheet.fillna("", inplace=True)     # empty cells become '' so they test as false
        NO_MATHML_COUNT = 0

        # (character, braille, symbol name) triples; one is added for every usable row
        entries = []
        for _, row in sheet.iterrows():
            # An explicit character in the sheet wins over everything else.
            if row['Character'] and row[lang]:
                entries.append((row['Character'].strip(), row[lang], row['Symbol Name']))
                continue
            if should_ignore_line(row['Symbol Name']) or not row[lang]:
                continue

            # Either a one-char string (hard-coded table) or a list of candidate strings.
            char = hard_coded_name_to_unicode(row['Symbol Name']) or get_unicode(row)
            if not char:
                continue
            if len(char) > 1:
                print(f"Multiple chars for {row['Symbol Name']}, {char}")
            elif len(char[0]) == 0:
                print(f"Didn't find Unicode char for '{row['Symbol Name']}'")
            else:
                entries.append((char[0], row[lang], row['Symbol Name']))
        print(f"Total {lang} lines = {len(entries)}, no MathML = {NO_MATHML_COUNT}")

        # Order by character, then keep only the first entry seen for each character.
        entries.sort(key=lambda entry: entry[0])
        unique_entries = []
        previous_char = 'xxx'   # sentinel that is not expected to occur as a character
        for entry in entries:
            if entry[0] == previous_char:
                continue
            unique_entries.append(entry)
            previous_char = entry[0]
        print(f"  unique entries: {len(unique_entries)}")

        # Format everything first (this may print warnings), then write it all out.
        yaml_lines = [generate_string(char, dots, name) for char, dots, name in unique_entries]
        out_stream.writelines(yaml_lines)


# Spreadsheet column names that get_unicode() scans for a MathML example.
# The strings are used verbatim as row keys, so spelling, capitalization and spacing matter.
MATHML_ENTRIES = [
    # All the fields in the table that could have some MathML in them -- seems randomly spread out so none better than another
    "UEB MathML example",
    "BAUK MathML example",
    "Marburg Mathml example",
    "Unified Spanish mathml example",
    "Woluwe mathml example",
    "Antoine mathml example",
    "Italian mathml example",
    "Sweden mathml example",
    # NOTE(review): the double space below presumably mirrors the spreadsheet header -- confirm
    "Mathml  example",
    "Mathml example"
    ]


def get_unicode(line: Dict[str, str]) -> list[str]:
    '''Finds the candidate Unicode character(s) for a spreadsheet row from its MathML examples.

       Every column named in MATHML_ENTRIES is read; cells that do not contain '</math>'
       (empty cells, or cells holding something other than MathML) are discarded.

       Returns the distinct <mo> contents found across the remaining cells, so a list with
       more than one item means the examples disagree. Returns [''] and bumps
       NO_MATHML_COUNT when the row has no MathML at all.'''
    global NO_MATHML_COUNT
    symbol_name = line['Symbol Name']

    # An empty cell can't contain '</math>', so one test rejects both empty and non-MathML cells.
    cells = [str(line[column]) for column in MATHML_ENTRIES]
    mathml_cells = [cell for cell in cells if '</math>' in cell]
    if not mathml_cells:
        print(f"*** {line['Symbol Name']} has no MathML -- can't find Unicode char")
        NO_MATHML_COUNT += 1
        return ['']

    # Collapse identical examples first, then collapse identical operators pulled from them.
    distinct_mathml = get_unique_list(mathml_cells)
    operators = [get_mo(mathml, symbol_name) for mathml in distinct_mathml]
    return get_unique_list(list_to_clean=operators)


def get_unique_list(list_to_clean: list[str]) -> list[str]:
    '''Returns a new list with duplicates removed, keeping the first occurrence of each
       item in its original order. The input list is not modified.'''
    # dict keys are unique and keep insertion order, so this is a linear-time, order-preserving
    # de-duplication (the items are strings, hence hashable).
    return list(dict.fromkeys(list_to_clean))


# An <mo> element (with or without attributes); group 1 is its text content.
MO_MATCH = re.compile(r'<mo[^>]*>([^<]+)</mo>')
# Hexadecimal character reference such as '&#x2212;' or '&#x1D400;'; group 1 is the hex value
# without leading zeros. Accepts every code point up to 0x10FFFF (the largest value chr()
# takes), so characters outside the Basic Multilingual Plane are decoded too.
NUMERIC_ENTITY = re.compile(r'&#x0*(10[0-9a-fA-F]{4}|[0-9a-fA-F]{1,5});')
# Named entity such as '&pi;'; group 1 is the entity name.
NAME_ENTITY = re.compile(r'&([\w]+);')


def get_mo(mathml: str, symbol_name: str) -> str:
    '''Returns the content of the <mo> in `mathml` that most likely is the symbol itself.

       Entities are decoded first. If several distinct <mo>s are present, parentheses are
       dropped (examples often wrap the symbol in parens); if that still leaves more than
       one, the single non-ASCII candidate is preferred, otherwise the first one is used.

       `symbol_name` is only used in the diagnostic messages printed to stdout.
       Returns "" when no usable <mo> is found.'''
    mo_text = get_unique_list(MO_MATCH.findall(entity_to_char(mathml)))
    if not mo_text:
        print(f"get_mo: {symbol_name} -- No <mo>s in mathml={mathml}")
        return ""
    if len(mo_text) == 1:
        return mo_text[0]

    # exclude parens -- examples often have parens
    mo_text = [mo for mo in mo_text if mo not in ('(', ')')]
    if len(mo_text) == 1:
        return mo_text[0]

    print(f"get_mo: {symbol_name} -- #mo={len(mo_text)}; mathml={mathml}")
    if not mo_text:
        # The example contained nothing but parentheses, so there is no candidate left.
        return ""
    # If there is more than one, throw out the ASCII ones. <mo> content can be several
    # characters long (e.g. ':='), so test the whole string rather than calling ord() on it.
    non_ascii_mos = [mo for mo in mo_text if not mo.isascii()]
    if len(non_ascii_mos) == 1:
        return non_ascii_mos[0]
    return mo_text[0]


def entity_to_char(mathml: str) -> str:
    '''Returns `mathml` with its hex character references and named entities replaced by the
       characters they stand for. Named entities missing from ENTITY_NAME_TO_CHAR are left
       untouched.'''
    def hex_to_char(match: re.Match) -> str:
        return chr(int(match.group(1), 16))

    def name_to_char(match: re.Match) -> str:
        entity_name = match.group(1)
        if entity_name in ENTITY_NAME_TO_CHAR:
            return ENTITY_NAME_TO_CHAR[entity_name]
        return match.group(0)       # unknown entity: keep the original text

    # Numeric references are decoded first, then named entities are resolved on the result.
    without_numeric = NUMERIC_ENTITY.sub(hex_to_char, mathml)
    return NAME_ENTITY.sub(name_to_char, without_numeric)


# Symbol names of the form 'digit <word>' (matched at the start of the name).
DIGIT_NAME = re.compile(r'digit ([a-z]+)')
# English digit word -> the corresponding ASCII digit character.
NAME_TO_DIGIT = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
}


def hard_coded_name_to_unicode(symbol_name: str) -> str:
    '''Converts the first column "Symbol Name" to a single char string representing its Unicode char.
       Works for:
       * digits

       If no match, returns an empty string. That includes names that start with 'digit '
       but are not followed by a digit word (e.g., 'digit separator').'''
    digit = DIGIT_NAME.match(symbol_name)
    if digit:
        # .get() rather than [] so an unknown word after 'digit ' is "no match", not a KeyError
        return NAME_TO_DIGIT.get(digit.group(1), '')
    return ''


# Names that begin with 'arc' followed by a lowercase letter (e.g., 'arcsin').
ARC_TRIG_FUNCTIONS = re.compile(r'arc[a-z]')


def should_ignore_line(symbol_name: str) -> bool:
    '''Tells whether a spreadsheet row should be skipped based on its symbol name.

       A row is skipped when the name contains 'letter' (no difference between codes),
       'indicator' (e.g., capital letter indicator, superscript indicator), 'roman number'
       or 'enlarged', or when the name starts with 'arc' followed by a lowercase letter.'''
    ignored_keywords = ("letter", "indicator", "roman number", "enlarged")
    if any(keyword in symbol_name for keyword in ignored_keywords):
        return True
    return ARC_TRIG_FUNCTIONS.match(symbol_name) is not None


def generate_string(ch: str, dots: str, comment: str) -> str:
    '''Generates one newline-terminated line of unicode.yaml.

       `ch` is the character being defined, `dots` its braille and `comment` the text put
       in the trailing comment. ASCII digits get "N" prefixed to their braille.

       If `ch` is longer than one character a warning is printed and the line is still
       generated (without the code point in the comment, since there is no single one).'''
    if len(ch) > 1:
        print(f"""***Unicode char is more than one char: "{ch}": [t: "{dots}"] # ({comment})\n""")
        # Same ' - ' list indent and trailing newline as the normal case, so the entry
        # stays on its own line and at the same YAML list level as its neighbours.
        return f""" - "{ch}": [t: "{dots}"] # ({comment})\n"""
    if '0' <= ch <= '9':
        dots = "N" + dots
    return f""" - "{ch}": [t: "{dots}"]                # 0x{ord(ch):04X} ({comment})\n"""


# Script entry: the generation runs whenever this module is executed or imported (there is
# no __main__ guard). Swap which call is commented out to generate a different code's file.
# get_braille_values('Swedish')
get_braille_values('Antoine')