'''Reads the braille character Excel spreadsheet and generates unicode.yaml files as best it can given a language param'''
from typing import Dict
import csv
import re
from functools import reduce
import sys
from entities import ENTITY_NAME_TO_CHAR
import openpyxl
import pandas

sys.stdout.reconfigure(encoding='utf-8')    # type: ignore

# Number of rows for which no MathML example (and hence no Unicode char) was found.
# Reset at the start of every get_braille_values() run and bumped by get_unicode().
NO_MATHML_COUNT = 0


def get_braille_values(lang: str):
    '''Writes '{lang}-unicode.yaml' and spits out a bunch of warnings to standard out.

    `lang` is the name of the spreadsheet column that holds the braille for the wanted code.
    The spreadsheet is read and processed completely before the output file is opened, so a
    failure while reading it does not leave a truncated yaml file behind.'''
    global NO_MATHML_COUNT
    NO_MATHML_COUNT = 0

    with pandas.ExcelFile('BrailleMathCodes Repository.xlsx') as xls:
        sheet = pandas.read_excel(xls, sheet_name='Enhanced Unicode Other codes')
    # empty cells become "" so that plain truthiness tests work on every column
    sheet = sheet.fillna("")

    entries = []            # (char, braille, symbol name) tuples
    for _, row in sheet.iterrows():
        name = row['Symbol Name']
        dots = row[lang]
        if row['Character'] and dots:
            # the spreadsheet already tells us the character
            entries.append((row['Character'].strip(), dots, name))
            continue
        if not dots or should_ignore_line(name):
            continue

        # no explicit character: try the symbol name first, then the MathML examples
        hard_coded = hard_coded_name_to_unicode(name)
        chars = [hard_coded] if hard_coded else get_unicode(row)
        if not chars:
            continue
        if len(chars) > 1:
            print(f"Multiple chars for {name}, {chars}")
        elif len(chars[0]) == 0:
            print(f"Didn't find Unicode char for '{name}'")
        else:
            entries.append((chars[0], dots, name))
    print(f"Total {lang} lines = {len(entries)}, no MathML = {NO_MATHML_COUNT}")

    # keep only the first entry seen for each char, then order the survivors by char
    first_by_char = {}
    for entry in entries:
        first_by_char.setdefault(entry[0], entry)
    unique_list = [first_by_char[ch] for ch in sorted(first_by_char)]
    print(f" unique entries: {len(unique_list)}")

    with open(f"{lang}-unicode.yaml", mode='w', encoding='utf8') as out_stream:
        out_stream.writelines(generate_string(ch, dots, name) for ch, dots, name in unique_list)


MATHML_ENTRIES = [
    # All the fields in the table that could have some MathML in them -- seems randomly spread out so none better than another
    # NOTE(review): "Mathml example" is listed twice; harmless because the values are de-duplicated,
    #   but possibly a different column name was intended -- confirm against the spreadsheet.
    "UEB MathML example", "BAUK MathML example", "Marburg Mathml example", "Unified Spanish mathml example",
    "Woluwe mathml example", "Antoine mathml example", "Italian mathml example", "Sweden mathml example",
    "Mathml example", "Mathml example"
]


def get_unicode(line: Dict[str, str]) -> list[str]:
    '''Returns the candidate Unicode chars for the line, taken from its MathML example columns.

    The result is `['']` when no column holds any markup, otherwise one (possibly empty) string
    per distinct operator found -- more than one element means the examples disagree.'''
    global NO_MATHML_COUNT
    name = line['Symbol Name']
    cell_values = (str(line[column]) for column in MATHML_ENTRIES)
    # some entries in the columns aren't MathML: keep only cells that contain markup
    mathml_cells = [value for value in cell_values if '<' in value]
    if not mathml_cells:
        print(f"*** {name} has no MathML -- can't find Unicode char")
        NO_MATHML_COUNT += 1
        return ['']

    # make sure all the MathML is the same
    differing_entries = get_unique_list(mathml_cells)
    return get_unique_list(list_to_clean=[get_mo(mathml, name) for mathml in differing_entries])


def get_unique_list(list_to_clean: list[str]) -> list[str]:
    '''Takes a list and returns a shortened list with only unique entries (first occurrence order is kept)'''
    return list(dict.fromkeys(list_to_clean))


# Text content of an <mo> element; the (?:\s[^>]*)? part allows attributes without also matching <mover> etc.
MO_MATCH = re.compile(r'<mo(?:\s[^>]*)?>([^<]+)')
# Hex character reference; up to six digits so chars outside the BMP (e.g., &#x1D400;) are handled
NUMERIC_ENTITY = re.compile(r'&#x([0-9a-fA-F]{1,6});')
NAME_ENTITY = re.compile(r'&([\w]+);')


def get_mo(mathml: str, symbol_name: str) -> str:
    '''Gets the contents of an <mo>.

    Returns "" (after printing a warning) when the MathML has no usable <mo>. When several
    different <mo>s are present, parens are ignored and a lone non-ASCII operator is preferred;
    failing that, the first remaining operator is returned.'''
    mo_text = get_unique_list(MO_MATCH.findall(entity_to_char(mathml)))
    if not mo_text:
        print(f"get_mo: {symbol_name} -- No <mo>s in mathml={mathml}")
        return ""
    if len(mo_text) == 1:
        return mo_text[0]

    # exclude parens -- examples often have parens
    mo_text = [mo for mo in mo_text if mo not in ('(', ')')]
    if len(mo_text) == 1:
        return mo_text[0]

    print(f"get_mo: {symbol_name} -- #mo={len(mo_text)}; mathml={mathml}")
    if not mo_text:
        return ""       # the only operators were parens -- can't tell which one was meant
    # if there is more than one, throw out the ASCII chars
    # (isascii() rather than ord() because an <mo> can hold several chars, e.g. "lim")
    non_ascii_mos = [mo for mo in mo_text if not mo.isascii()]
    if len(non_ascii_mos) == 1:
        return non_ascii_mos[0]
    return mo_text[0]


def _numeric_entity_to_char(match: re.Match[str]) -> str:
    '''Converts one NUMERIC_ENTITY match to its char; references beyond the Unicode range are left as is.'''
    code_point = int(match.group(1), 16)
    if code_point > sys.maxunicode:
        return match.group(0)
    return chr(code_point)


def entity_to_char(mathml: str) -> str:
    '''Returns `mathml` with hex and named entities replaced by the actual Unicode chars.

    Named entities that are not in ENTITY_NAME_TO_CHAR are left untouched.'''
    without_numeric = NUMERIC_ENTITY.sub(_numeric_entity_to_char, mathml)
    return NAME_ENTITY.sub(
        lambda name: ENTITY_NAME_TO_CHAR.get(name.group(1), name.group(0)),
        without_numeric
    )


DIGIT_NAME = re.compile(r'digit ([a-z]+)')
NAME_TO_DIGIT = {
    'zero': '0',
    'one': '1',
    'two': '2',
    'three': '3',
    'four': '4',
    'five': '5',
    'six': '6',
    'seven': '7',
    'eight': '8',
    'nine': '9',
}


def hard_coded_name_to_unicode(symbol_name: str) -> str:
    '''Converts the first column "Symbol Name" to a single char string representing its Unicode char.
    Works for:
    * digits
    If no match (including "digit <something unknown>"), returns an empty string'''
    digit = DIGIT_NAME.match(symbol_name)
    if digit:
        return NAME_TO_DIGIT.get(digit.group(1), '')
    return ''


ARC_TRIG_FUNCTIONS = re.compile(r'arc[a-z]')


def should_ignore_line(symbol_name: str) -> bool:
    '''Returns true if the line should be ignored because:
    * it is a letter -- no difference between codes
    * it is an indicator (e.g., capital letter indicator, superscript indicator)
    * it is a roman number, an enlarged symbol, or an arc trig function name
    '''
    ignored_words = ("letter", "indicator", "roman number", "enlarged")
    return (
        any(word in symbol_name for word in ignored_words) or
        ARC_TRIG_FUNCTIONS.match(symbol_name) is not None
    )


def generate_string(ch: str, dots: str, comment: str) -> str:
    '''Generates a line in unicode.yaml (including the trailing newline).

    Digits get an "N" prefixed to their braille. A multi-char `ch` is reported and written
    without the hex comment; an empty `ch` is reported and produces no output ('').'''
    if not ch:
        print(f"""***No Unicode char for [t: "{dots}"] # ({comment})""")
        return ''
    if len(ch) > 1:
        print(f"""***Unicode char is more than one char: "{ch}": [t: "{dots}"] # ({comment})\n""")
        # same shape as a normal line (leading space, newline) so the next entry starts on its own line
        return f""" - "{ch}": [t: "{dots}"] # ({comment})\n"""
    if '0' <= ch <= '9':
        dots = "N" + dots
    return f""" - "{ch}": [t: "{dots}"] # 0x{ord(ch):04X} ({comment})\n"""


# Only generate files when run as a script, not when the module is imported.
if __name__ == "__main__":
    # get_braille_values('Swedish')
    get_braille_values('Antoine')