'''Reads the braille character Excel spreadsheet and
generates unicode.yaml files as best it can given a language param'''
from typing import Dict
import csv
import re
from functools import reduce
import sys
from entities import ENTITY_NAME_TO_CHAR
import openpyxl
import pandas
# The diagnostics printed by this script include the Unicode characters being processed,
# so force stdout to UTF-8 regardless of the console's default encoding.
sys.stdout.reconfigure(encoding='utf-8') # type: ignore
# Number of rows for which get_unicode() found no MathML example.
# Reset at the start of get_braille_values() and incremented in get_unicode().
NO_MATHML_COUNT = 0
def get_braille_values(lang: str):
    '''Writes '{lang}-unicode.yaml' and spits out a bunch of warnings to standard out.

    Reads the 'Enhanced Unicode Other codes' sheet of 'BrailleMathCodes Repository.xlsx'
    and, for every row that has a value in the `lang` column, works out the Unicode
    character the row describes:
      * the 'Character' column is used when it is filled in;
      * otherwise rows rejected by should_ignore_line() are skipped, and the character
        is derived from the 'Symbol Name' (digits) or from the row's MathML examples.

    The entries are sorted by character, only the first entry for each character is
    kept, and the result is written out one generate_string() line per entry.

    lang: name of the spreadsheet column holding the braille for the wanted code;
          also used as the prefix of the output file name.
    '''
    global NO_MATHML_COUNT
    NO_MATHML_COUNT = 0

    # Read everything before touching the output file so that a missing workbook,
    # sheet or column does not leave a truncated '{lang}-unicode.yaml' behind.
    with pandas.ExcelFile('BrailleMathCodes Repository.xlsx') as xls:
        sheet = pandas.read_excel(xls, sheet_name='Enhanced Unicode Other codes')
    sheet.fillna("", inplace=True)

    entries = []        # (character, braille, symbol name)
    for _, line in sheet.iterrows():
        dots = line[lang]
        if not dots:
            continue    # nothing to emit for this code
        name = line['Symbol Name']

        # Cells holding a bare number come back from pandas as numbers, not strings,
        # so convert before stripping. A blank/whitespace-only cell falls through to
        # the name/MathML based lookup instead of producing an empty character.
        character = str(line['Character']).strip()
        if character:
            entries.append((character, dots, name))
            continue

        if should_ignore_line(name):
            continue

        digit = hard_coded_name_to_unicode(name)
        chars = [digit] if digit else get_unicode(line)
        if len(chars) > 1:
            print(f"Multiple chars for {name}, {chars}")
        elif not chars or not chars[0]:
            print(f"Didn't find Unicode char for '{name}'")
        else:
            entries.append((chars[0], dots, name))

    print(f"Total {lang} lines = {len(entries)}, no MathML = {NO_MATHML_COUNT}")

    # Sort the entries and remove the duplicates: the first row seen for a character wins.
    first_by_char = {}
    for entry in entries:
        first_by_char.setdefault(entry[0], entry)
    unique_list = [first_by_char[ch] for ch in sorted(first_by_char)]
    print(f" unique entries: {len(unique_list)}")

    # Build all the lines first (generate_string may print warnings), then write them out.
    yaml_lines = [generate_string(ch, dots, name) for ch, dots, name in unique_list]
    with open(f"{lang}-unicode.yaml", mode='w', encoding='utf8') as out_stream:
        out_stream.writelines(yaml_lines)
MATHML_ENTRIES = [
    # All the fields in the table that could have some MathML in them -- seems randomly spread out so none better than another
    "UEB MathML example",
    "BAUK MathML example",
    "Marburg Mathml example",
    "Unified Spanish mathml example",
    "Woluwe mathml example",
    "Antoine mathml example",
    "Italian mathml example",
    "Sweden mathml example",
    # NOTE(review): this name used to be listed twice, which just read the same column twice.
    # If the spreadsheet really has two columns with this header, pandas presumably renames
    # the second one (e.g. "Mathml example.1") -- confirm and add that name if it is wanted.
    "Mathml example",
]
def get_unicode(line: Dict[str, str]) -> list[str]:
    '''Returns the candidate Unicode chars for the line, taken from its MathML examples.

    line: a spreadsheet row; the 'Symbol Name' column and the columns named in
          MATHML_ENTRIES are read.

    Returns a list of the distinct strings get_mo() found in the row's MathML cells:
      * [''] if none of the cells contain MathML (NO_MATHML_COUNT is incremented
        and a warning is printed);
      * a single-element list when all the examples agree;
      * several elements when the examples disagree -- the caller decides what to do.
    '''
    global NO_MATHML_COUNT
    name = line['Symbol Name']

    # Some entries in the columns are empty or aren't MathML -- drop those.
    # NOTE(review): the previous test was `'' in x`, which is true for every string and so
    # filtered nothing. '<mo' is assumed to be the intended marker because get_mo() is
    # meant to read operator (<mo>) elements -- confirm against the spreadsheet contents.
    cells = (str(line[column]) for column in MATHML_ENTRIES)
    mathml_cells = [cell for cell in cells if '<mo' in cell]
    if not mathml_cells:
        print(f"*** {name} has no MathML -- can't find Unicode char")
        NO_MATHML_COUNT += 1
        return ['']

    # Identical examples only need to be looked at once; then collapse identical results
    # so the caller can tell whether all the MathML agrees.
    distinct_mathml = get_unique_list(mathml_cells)
    return get_unique_list([get_mo(mathml, name) for mathml in distinct_mathml])
def get_unique_list(list_to_clean: list[str]) -> list[str]:
    '''Takes a list and returns a new list with only the unique entries.

    The first occurrence of each entry is kept and the original order is preserved.
    The input list is not modified.
    '''
    # dicts keep insertion order, so this de-duplicates in one linear pass
    # (the entries are strings, so they are hashable).
    return list(dict.fromkeys(list_to_clean))
# Text content of an <mo> element (attributes allowed). The `\b` keeps tags that merely
# start with "mo" (e.g. <mover>) from being treated as <mo>.
MO_MATCH = re.compile(r'<mo\b[^>]*>([^<]+)</mo>')
# Hexadecimal character reference such as "&#x2212;"; group 1 is the hex digits.
# Up to 5 digits so that supplementary-plane chars (e.g. "&#x1D44E;") are recognized;
# every value that can match is a valid argument to chr().
NUMERIC_ENTITY = re.compile(r'&#x([0-9a-fA-F]{2,5});')
# Named entity reference such as "&alpha;"; group 1 is the name.
NAME_ENTITY = re.compile(r'&([\w]+);')
def get_mo(mathml: str, symbol_name: str) -> str:
    '''Gets the contents of an <mo>.

    mathml: MathML example text; entities are converted with entity_to_char() first.
    symbol_name: only used in the warnings that are printed.

    Returns the text of the operator the example is about, or "" if none was found:
      * if there is exactly one distinct <mo>, that one;
      * otherwise parentheses are ignored (examples often have parens);
      * if that still leaves several, a warning is printed and the only non-ASCII
        one is returned if there is exactly one, else the first one.
    '''
    mo_text = get_unique_list(MO_MATCH.findall(entity_to_char(mathml)))
    if not mo_text:
        print(f"get_mo: {symbol_name} -- No <mo>s in mathml={mathml}")
        return ""
    if len(mo_text) == 1:
        return mo_text[0]

    # exclude parens -- examples often have parens
    candidates = [mo for mo in mo_text if mo not in ('(', ')')]
    if len(candidates) == 1:
        return candidates[0]

    print(f"get_mo: {symbol_name} -- #mo={len(candidates)}; mathml={mathml}")
    # If there is more than one, throw out the ASCII ones. An <mo> can hold more than
    # one character (e.g. an unresolved entity), so test the whole string rather than
    # calling ord(), which only accepts single characters.
    non_ascii_mos = [mo for mo in candidates if not mo.isascii()]
    if len(non_ascii_mos) == 1:
        return non_ascii_mos[0]
    # `candidates` is empty when the only operators were the two parentheses
    return candidates[0] if candidates else ""
def entity_to_char(mathml: str) -> str:
    '''Replaces entity references in `mathml` with the characters they stand for.

    References matched by NUMERIC_ENTITY are converted from their hex value first;
    then names matched by NAME_ENTITY are looked up in ENTITY_NAME_TO_CHAR.
    Names that are not in the table are left in the text unchanged.
    '''
    def from_hex(match):
        return chr(int(match.group(1), 16))

    def from_name(match):
        entity_name = match.group(1)
        if entity_name in ENTITY_NAME_TO_CHAR:
            return ENTITY_NAME_TO_CHAR[entity_name]
        return match.group(0)       # unknown name: keep the reference as written

    without_numeric = NUMERIC_ENTITY.sub(from_hex, mathml)
    return NAME_ENTITY.sub(from_name, without_numeric)
# Symbol names of the form "digit <word>" (matched at the start of the name)
DIGIT_NAME = re.compile(r'digit ([a-z]+)')
NAME_TO_DIGIT = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
}


def hard_coded_name_to_unicode(symbol_name: str) -> str:
    '''Converts the first column "Symbol Name" to a single char string representing its Unicode char.
    Works for:
    * digits
    If no match, returns an empty string'''
    digit = DIGIT_NAME.match(symbol_name)
    if digit:
        # "digit" followed by something that isn't a digit word (e.g. "digit separator")
        # is not a hard-coded name, so report "no match" rather than raising KeyError.
        return NAME_TO_DIGIT.get(digit.group(1), '')
    return ''
# Symbol names that start with "arc" followed by a letter
ARC_TRIG_FUNCTIONS = re.compile(r'arc[a-z]')


def should_ignore_line(symbol_name: str) -> bool:
    '''Returns true if the line should be ignored because its symbol name:
    * contains "letter" -- it is a letter, no difference between codes
    * contains "indicator" (e.g., capital letter indicator, superscript indicator)
    * contains "roman number" or "enlarged"
    * starts with "arc" followed by a lowercase letter
    '''
    ignored_words = ("letter", "indicator", "roman number", "enlarged")
    if any(word in symbol_name for word in ignored_words):
        return True
    return ARC_TRIG_FUNCTIONS.match(symbol_name) is not None
def _yaml_escape(text: str) -> str:
    '''Escapes backslash and double quote so `text` can sit inside a YAML "..." scalar.'''
    return text.replace('\\', '\\\\').replace('"', '\\"')


def generate_string(ch: str, dots: str, comment: str) -> str:
    '''Generates a line in unicode.yaml

    ch: the Unicode character (key of the entry).
    dots: the braille for the character; "N" is prepended when `ch` is an ASCII digit.
    comment: text put in parentheses in the trailing comment (the symbol name).

    Returns one complete line, newline included. When `ch` is longer than one character
    a warning is printed and the line is written without the hex code point.
    '''
    key = _yaml_escape(ch)
    if len(ch) > 1:
        print(f"""***Unicode char is more than one char: "{ch}": [t: "{dots}"] # ({comment})\n""")
        # Same shape as a normal line (leading space, trailing newline); without the
        # newline the next entry would land inside this line's comment and be lost.
        return f""" - "{key}": [t: "{_yaml_escape(dots)}"] # ({comment})\n"""
    if '0' <= ch <= '9':
        dots = "N" + dots
    return f""" - "{key}": [t: "{_yaml_escape(dots)}"] # 0x{ord(ch):04X} ({comment})\n"""
# Runs whenever this file is executed: generate the yaml file for one braille code.
# The argument is the name of the spreadsheet column to export; swap in another call to change it.
# get_braille_values('Swedish')
get_braille_values('Antoine')