"""Scripts that build YAML character tables from LaTeX, ASCIIMath and Unicode data files.

Each ``create_*`` / ``extract_*`` function reads one or more data files from the
current directory and writes YAML entries through :func:`write_line`.  The task
to run is chosen by (un)commenting the calls at the bottom of the file.
"""
import json
import re
import sys
import xml.etree.ElementTree as ET
from string import ascii_lowercase, ascii_uppercase
from typing import TextIO

from bs4 import BeautifulSoup
from html_table_extractor.extractor import Extractor

# Only needed by the (currently disabled) "translated dots" output variant of write_line.
from ascii_braille import ascii_to_euro_braille

sys.stdout.reconfigure(encoding='utf-8')


def create_unicode_from_latex_symbols_html(out_file: str):
    """Write one YAML line per (unicode, latex) pair found in ``latex-symbols.htm``.

    The LaTeX names are taken from the ``<dt><code>`` elements of the first
    ``<dl>`` and the characters from the first word of each ``<dd><p>``.
    Pairs are written to ``out_file`` sorted by character.
    """
    with open("latex-symbols.htm", encoding='utf8') as in_stream, \
            open(out_file, 'w', encoding='utf8') as out_stream:
        file_contents = BeautifulSoup(in_stream, features="html.parser")
        def_list = file_contents.select_one('dl')
        if def_list is None:
            print("didn't find 'dl'")
            return
        latex_list = [code.contents[0] for code in def_list.select('dt code')]
        unicode_list = [dd.find('p').contents[0].split(' ')[0] for dd in def_list.select('dd')]
        for unicode, latex in sorted(zip(unicode_list, latex_list)):
            write_line(unicode, latex, "", False, out_stream)


# Header written at the top of the generated LaTeX YAML files (one YAML comment per line).
LATEX_COMMENT = """\
# This file is derived from a number of sources.
# This tries to conform to the "spec" augenbit.de/wiki/index.php?title=LaTeX-Manual_LaTeX_Grundregeln (and linked files)
# The short names come from MathLib.tex that is linked from above
# Where there is not a conflict, the next source of names comes from "standard" LaTeX names listed in
# en.wikipedia.org/wiki/List_of_mathematical_symbols_by_subject
# Otherwise they are based on the XML entities document (w3c.github.io/xml-entities).
# That references the data file github.com/w3c/xml-entities/blob/gh-pages/unicode.xml
# The fields in that are used are ["mathlatex", "latex", "varlatex", "ams"], with different names added as comments.
# Note: there is some filtering of unlikely names, so the above is not 100% as to the output, but it is close.
"""


def get_unicode_standard_symbols() -> dict[str, list[str]]:
    """Return ``{char: [latex names]}`` scraped from the Wikipedia symbol tables.

    Reads ``List of mathematical symbols by subject.htm``.  Only rows whose
    symbol cell is a single character are used, and only ``<code>`` entries
    starting with a backslash are kept.  Names keep their first-seen order and
    duplicates are dropped.  Returns an empty dict when the page has no tables.
    """
    # the HTML file has rowspans in it -- hence the use of table extractor
    with open("List of mathematical symbols by subject.htm", encoding='utf8') as in_stream:
        file_contents = BeautifulSoup(in_stream, features="html.parser")

    tables = file_contents.select('table')
    if not tables:      # select() returns an empty list (never None) when nothing matches
        print("didn't find 'tables'")
        return {}

    all_entries: list[tuple[str, str]] = []
    for table in tables:
        extractor = Extractor(table.decode())
        extractor.parse()
        rows = extractor.return_list()
        if len(rows) < 2:
            continue    # header-only (or empty) table: nothing to extract
        # the column holding the LaTeX names depends on the table layout
        i_latex = 3 if len(rows[1]) == 6 else 4
        for row in rows[1:]:
            try:
                unicode = row[1].text
                if not isinstance(unicode, str):
                    continue
                unicode = unicode.strip()
            except (IndexError, TypeError, AttributeError) as err:
                print(f"Error in getting unicode in {row}\nError is '{err}'")
                continue    # don't fall through with a stale/unbound 'unicode'
            if len(unicode) != 1:   # filter out "det", etc
                continue
            for code in row[i_latex].select('code'):
                latex: str = code.text.strip()
                if latex.startswith('\\'):  # filter out ASCII and some other oddballs
                    all_entries.append((unicode, latex))

    print(f'all_entries len={len(all_entries)}')
    answer: dict[str, list[str]] = {}
    for key, val in all_entries:
        names = answer.setdefault(key, [])
        if val not in names:
            names.append(val)
    print(f'answer len={len(answer)}')
    return answer


UNICODE_CH_PATTERN = re.compile(r' - "(.)"')


def get_unicode_yaml_chars(file: str, include_ascii: bool) -> set[str]:
    """Returns a set of all the chars 'file' (full path).

    A char is picked up from every line that starts with `` - "<char>"``.
    If 'include_ascii' is False, ASCII chars are excluded.
    The code points U+0391..U+03A9 (Greek capitals Alpha..Omega) are always added.
    """
    answer: set[str] = set()
    with open(file, "r", encoding='utf8') as unicode_stream:
        for line in unicode_stream:
            matched = UNICODE_CH_PATTERN.match(line)
            if matched and (include_ascii or ord(matched.group(1)) > 127):
                answer.add(matched.group(1))
    # these are a range in unicode.yaml, so the pattern doesn't match;
    # written as code points so the literals can't be damaged by re-encoding, and
    # '+ 1' because range() excludes its end and Omega belongs to the range
    answer.update(chr(code) for code in range(0x0391, 0x03A9 + 1))
    return answer


# The chars in unicode.yaml (others go into unicode-full.yaml)
UNICODE_CHARS_SHORT = get_unicode_yaml_chars("../Rules/Languages/en/unicode.yaml", False)


def get_short_dict() -> dict[str, str]:
    """Return ``{latex_name: short_name}`` read from ``euro-braille-short.csv``.

    Each line holds the short name followed by the LaTeX name, separated by one
    or more spaces.  Lines with fewer than two fields are skipped.  Entries for
    blackboard-bold ASCII letters are added at the end.
    """
    answer: dict[str, str] = {}
    with open("euro-braille-short.csv", "r", encoding='utf8') as stream:
        for line in stream:
            parts = [part for part in line.split(' ') if part != '']
            if len(parts) < 2:
                continue    # blank or malformed line
            answer[parts[1].strip()] = parts[0].strip()
    for ch in ascii_lowercase + ascii_uppercase:
        answer[f'\\mathbb𝐖{ch}'] = f'\\{ch}'
    return answer


CHAR_IN_BRACES = re.compile(r'(\\.+)\{(.)\}')


def _mark_braced_char(latex_name: str) -> str:
    """Rewrite ``\\name{c}`` as ``\\name𝐖c``; names without that shape are returned unchanged."""
    return CHAR_IN_BRACES.sub(lambda match: f'{match.group(1)}𝐖{match.group(2)}', latex_name)


def _write_invisible_chars(out_stream: TextIO) -> None:
    """Append entries with empty text for U+2061..U+2064 to 'out_stream'."""
    out_stream.write('\n # invisible chars\n')
    for code in (0x2061, 0x2062, 0x2063, 0x2064):
        write_line(chr(code), '', '', False, out_stream)


def extract_latex(in_file):
    """Generate ``latex-braille-unicode.yaml`` and ``latex-braille-unicode-full.yaml``.

    'in_file' is the path of an XML file with a ``charlist`` element whose
    children carry an ``id`` of the form accepted by :func:`convert_to_char`.
    For every single-character entry at or above U+0020 the LaTeX name is chosen
    in this order: the scraped "standard" names (an override, if any, goes
    first), then the override table, then the ``mathlatex``/``latex``/
    ``varlatex``/``ams`` child elements.  The first name is written as the
    active rule; further names are written commented out.

    Chars in UNICODE_CHARS_SHORT, below U+007F, or in U+0370..U+03FF go to the
    short file, all others to the full file.  Exits with status 1 when
    'in_file' has no ``charlist`` element.
    """
    short_names = get_short_dict()
    standard_names = get_unicode_standard_symbols()
    print(f'len standard names = {len(standard_names)}')

    # Non-ASCII keys are spelled as escapes so they survive re-encoding of this file.
    overrides = {
        "*": "*",
        "{": "\\{",
        "}": "\\}",
        "|": "|",
        "%": "%",
        "\u00b0": "\u00b0",         # DEGREE SIGN
        "\u03f5": "\\epsilon",     # GREEK LUNATE EPSILON SYMBOL (original note: "varepsilon")
        "\u2260": "\\not=",        # NOT EQUAL TO
        "\u2032": "'",             # PRIME
        "\u2033": "''",            # DOUBLE PRIME
        "\u2034": "'''",           # TRIPLE PRIME
        "\u2264": "\\le",          # LESS-THAN OR EQUAL TO
        "\u2265": "\\ge",          # GREATER-THAN OR EQUAL TO
        # NOTE(review): the next two keys are LaTeX names, not characters, so the
        # 'ch in overrides' lookups below never hit them -- confirm the intent.
        "\\cdotp": "\\cdots",
        "\\varnothing": "\\emptyset",
        "\u25b3": "\\triangle",    # WHITE UP-POINTING TRIANGLE
        "\u2192": "\\to",          # RIGHTWARDS ARROW
        "\u2030": "\\permil",      # PER MILLE SIGN
    }

    tree = ET.parse(in_file)
    root: ET.Element = tree.getroot()
    all_char_elements = root.find("charlist")
    if all_char_elements is None:
        print(f"Didn't find XML root in {in_file}!")
        sys.exit(1)

    with open("latex-braille-unicode.yaml", 'w', encoding='utf8') as short_stream, \
            open("latex-braille-unicode-full.yaml", 'w', encoding='utf8') as full_stream:
        for out_stream in (short_stream, full_stream):
            out_stream.write(LATEX_COMMENT)
        for out_stream in (short_stream, full_stream):
            out_stream.write("\n---\n")

        for char_element in all_char_elements:
            if char_element is None:
                print("char_element is None!")
                continue
            ch = char_element.get("id")
            if ch is None:
                print('char_element.get("id") is None!')
                continue
            ch = convert_to_char(ch)
            if len(ch) > 1:
                continue
            code = ord(ch)
            if code < 0x20:
                continue

            # add in ASCII and the Greek block
            is_in_common_char_blocks = code < 0x7F or 0x0370 <= code <= 0x03FF
            stream = short_stream if ch in UNICODE_CHARS_SHORT or is_in_common_char_blocks else full_stream

            # use the standard name unless the char is in the override dict
            # if it and the standard name is an option, write it first
            if ch in standard_names:
                is_overridden = ch in overrides
                wrote_primary = False
                if is_overridden:
                    latex_name = overrides[ch]
                    write_line(ch, latex_name, short_names.get(latex_name, ''), False, stream)
                    wrote_primary = True
                for latex_name in standard_names[ch]:
                    if is_overridden and latex_name == overrides[ch]:
                        continue    # already written as the primary name
                    is_commented = wrote_primary    # everything after the first is an alternative name
                    wrote_primary = True
                    latex_name = _mark_braced_char(latex_name)
                    write_line(ch, latex_name, short_names.get(latex_name, ''), is_commented, stream)
                continue

            if ch in overrides:
                print(f"found override for '{ch}': {overrides[ch]}")
                latex_name = overrides[ch]
                write_line(ch, latex_name, short_names.get(latex_name, ''), False, stream)
                continue

            # I wish there was a simple way to choose the names.
            # Based on what David Carlisle (who maintains unicode.xml) recommends,
            # 'math_latex' is the preferred field except for the alphabets (I only exclude Greek and math alphanumerics)
            # For those, math_latex is more technically correct but not what most latex users are accustomed to
            names_seen: list[str] = []
            for style in ["mathlatex", "latex", "varlatex", "ams"]:
                name_element = char_element.find(style)
                if name_element is None or name_element.text is None:
                    continue
                latex_name = name_element.text.strip()

                # the fontencoding char won't happen
                # the \unicode (two ellipsis entries) have short names for the latex style
                if latex_name.startswith(('{\\fontencoding{', '\\unicode')):
                    continue
                if not latex_name.startswith(('\\', '{')) and code >= 0x7F:
                    latex_name = '\\' + latex_name      # some are missing the initial \
                if latex_name.startswith('\\mathchar'):
                    continue        # seems to happen once -- not sure what that is about
                if style == 'mathlatex':
                    if code < 0x7F:
                        continue    # use the latex names
                    if 0x0370 <= code <= 0x03FF:
                        continue    # Greek block
                    if 0x1D400 <= code <= 0x1D7FF:
                        continue    # alphanumerics
                    if latex_name.startswith('\\Bbb'):
                        continue    # some blackboard chars (ℝ, etc) not in math alphanumerics
                if latex_name.startswith('\\mbox'):
                    continue        # the alternative name avoids that and so is better

                if 'theta' in latex_name.lower():
                    latex_name = latex_name.replace("text", "")     # don't care about upright theta
                elif ch == '$':
                    latex_name = '\\$'
                elif ch == '\\':
                    latex_name = '\\backslash'                      # avoid '\textbackslash'
                elif latex_name.startswith("\\mitBbb"):
                    latex_name = latex_name.replace("\\mitBbb", "")  # exponential e, etc
                else:
                    latex_name = _mark_braced_char(latex_name)

                if latex_name in names_seen:
                    continue
                is_commented = bool(names_seen)     # anything after the first is an alternative name
                write_line(ch, latex_name, short_names.get(latex_name, ''), is_commented, stream)
                names_seen.append(latex_name)

        # write the invisible chars out
        _write_invisible_chars(short_stream)


def convert_to_char(str: str) -> str:
    """Convert an id such as 'Uddddd' or 'Uddddd-ddddd' (hex code points) to the string it names."""
    hex_part = str.split("U")[1]        # strip leading 'U'
    # NOTE: '"' and '\\' are returned unescaped; write_line() does the escaping
    return "".join(chr(int(char_str, base=16)) for char_str in hex_part.split("-"))


def write_line(ch: str, latex: str, short: str, is_commented: bool, out_stream: TextIO):
    """Write the YAML entry for 'ch' to 'out_stream'.

    Without a short name the entry is one line: `` - "ch": [t: "latex"]``.
    With a short name a nested ``test`` on ``$LaTeX_UseShortName`` selects
    between 'short' (then) and 'latex' (else).  If 'is_commented' is true,
    every emitted line is prefixed with ``#``.  A trailing comment gives the
    code point of 'ch' in hex.

    '𝐖' is appended to a name that starts with a backslash, is longer than two
    chars and does not end in '}' (presumably a whitespace placeholder that
    terminates the command name -- confirm against the consumer of the files).
    An IOError while writing is reported on stdout, not raised.
    """
    def hex_string(ch: str) -> str:
        # 'ch' is the (possibly escaped) form that is written between the quotes
        if ch == '\\\\' or ch == '\\"':
            return hex(ord(ch[1]))
        if len(ch) == 1:
            return hex(ord(ch))
        return "0" + ch[1:]         # '\x7F' -> '0x7F'

    def needs_marker(name: str) -> bool:
        return name.startswith('\\') and not name.endswith('}') and len(name) > 2

    if ch == '"':
        ch = '\\"'
    elif ch == '\\':
        ch = '\\\\'
    elif ch == '\x7f':              # DEL: write it as an escape, not as a raw control char
        ch = '\\x7F'
    elif ch == "\u00b0":            # DEGREE SIGN
        latex = "\u00b0"            # special case in their code

    # decided on the unescaped names, before backslashes get doubled below
    short_space = '𝐖' if needs_marker(short) else ''
    long_space = '𝐖' if needs_marker(latex) else ''

    # write untranslated text
    latex = latex.replace('\\', '\\\\').replace('"', '\\"')
    short = short.replace('\\', '\\\\').replace('"', '\\"')
    comment = "#" if is_commented else ""
    try:
        if short == '':
            first_part_char = f'{comment} - "{ch}": [t: "{latex + long_space}"]'
            out_stream.write(f'{first_part_char:<40} # {hex_string(ch)}\n')
        else:
            first_part_char = f'{comment} - "{ch}":'
            out_stream.write(f'{first_part_char:<40} # {hex_string(ch)}\n')
            # The nested lines must be indented deeper than the ' - "ch":' item
            # or the result is not valid YAML.
            out_stream.write(f'{comment}    - test:\n')
            out_stream.write(f'{comment}        if: "$LaTeX_UseShortName"\n')
            out_stream.write(f'{comment}        then: [t: "{short + short_space}"]\n')
            out_stream.write(f'{comment}        else: [t: "{latex + long_space}"]\n')
        # An earlier variant wrote the translated braille dots instead, using
        # ascii_to_euro_braille(latex + space) / ascii_to_euro_braille(short + space)
        # and testing "$LaTeX_UseShortName='True'".
    except IOError:
        print(f"failed to write a line for ch='{ch}/{hex_string(ch)}'")


def create_greek_letters(out_file: str):
    """Write YAML lines for the entries of ``greek-letters.txt`` to 'out_file'.

    Each input line is ``<char>\\t<latex>...``.  Lines without a second
    tab-separated field, or whose second field does not start with a backslash,
    are skipped.  Output is sorted by character.
    """
    all_entries = []
    with open("greek-letters.txt", encoding='utf8') as in_stream, \
            open(out_file, 'w', encoding='utf8') as out_stream:
        for line in in_stream:
            parts = line.split('\t')
            # ignore 'A', etc., which don't have latex commands (and blank lines)
            if len(parts) > 1 and parts[1].startswith('\\'):
                all_entries.append((parts[0].strip(), parts[1].strip()))
        for unicode, latex in sorted(all_entries):
            write_line(unicode, latex, "", False, out_stream)


def create_ascii_math(out_file: str):
    """Write YAML lines mapping chars to their ASCIIMath input, read from ``ascii-math-symbols.js``.

    The lines of the input that start with '{' are concatenated into a JSON
    array.  Entries tagged ``mi``/``mo``/``mtext`` with a single-character
    ``output`` are written; printable ASCII chars without an entry map to
    themselves.  Purely alphabetic inputs are wrapped in '𝐖' and spaces in the
    input are replaced by '𝐖'.

    Side outputs: entries with other tags are dumped to ``temp.json``, and the
    multi-character outputs ("function names") are printed.
    """
    symbol_tags = ['mi', 'mo', 'mtext']
    with open("ascii-math-symbols.js", encoding='utf8') as in_stream, \
            open(out_file, 'w', encoding='utf8') as out_stream:
        # weed out the comments
        json_as_str = '[' + ''.join(line for line in in_stream if line.startswith('{')) + ']'
        ascii_math_data = json.loads(json_as_str)

        all_entries = []
        for entry in ascii_math_data:
            if entry['tag'] in symbol_tags:
                ascii_math = entry['input']
                if ascii_math.isalpha():
                    ascii_math = '𝐖' + ascii_math + '𝐖'
                all_entries.append((entry['output'], ascii_math))
        all_entries.sort()

        # add in the ASCII chars (without them, unicode-full will get loaded)
        # first collect the ascii chars that have a representation
        defined_ascii_chars = set()
        for unicode, _ in all_entries:
            if len(unicode) != 1:
                continue    # ord() needs exactly one char
            if ord(unicode) > 127:
                break       # sorted: no single ASCII char can follow
            defined_ascii_chars.add(ord(unicode))

        # now add the ascii chars
        all_entries.extend((chr(i), chr(i)) for i in range(0x20, 0x7F) if i not in defined_ascii_chars)
        all_entries.sort()
        print(f'#all_entries={len(all_entries)}')

        function_names = []
        with open("temp.json", 'w', encoding='utf8') as temp_stream:
            for entry in ascii_math_data:
                if entry['tag'] in symbol_tags:
                    if len(entry['output']) > 1:
                        function_names.append(', "' + entry['output'] + '"')
                        if entry['input'] != entry['output']:
                            print(f"input and output don't match: '{entry['input']}' != '{entry['output']}'")
                else:
                    temp_stream.write(f"{entry}\n")
        print(f"function names:\n{''.join(function_names)}\n")

        out_stream.write("\n---\n")
        for unicode, ascii_math in all_entries:
            if len(unicode) == 1:
                write_line(unicode, ascii_math.replace(' ', '𝐖'), "", False, out_stream)

        # write the invisible chars out
        _write_invisible_chars(out_stream)


# create a list of chars for lambda conversion
def print_lambda_list():
    """Print, one per line and sorted, every char listed in the UEB ``unicode.yaml``."""
    chars_as_set = get_unicode_yaml_chars("../Rules/Braille/UEB/unicode.yaml", True)
    print(*sorted(chars_as_set), sep="\n")


def missing_unicode_chars(unicode_xml: str = r"c:\dev\mathml-refresh\xml-entities\unicode.xml") -> None:
    """Make sure all the math chars in unicode.xml are listed in one of MathCAT's unicode files

    A char counts as a math char when its ``unicodedata`` element has a
    ``mathclass`` other than "A" or "G".  Single-character entries missing
    from both ``unicode.yaml`` and ``unicode-full.yaml`` are written, with
    empty text, to ``missing_chars.yaml``.  Exits with status 1 when
    'unicode_xml' has no ``charlist`` element.
    """
    tree = ET.parse(unicode_xml)
    root: ET.Element = tree.getroot()
    print(f"Root='{root}'")
    all_char_elements = root.find("charlist")
    if all_char_elements is None:
        print(f"Didn't find XML root in {unicode_xml}!")
        sys.exit(1)

    all_unicode_math_chars = set()
    for char_element in all_char_elements:
        if char_element is None:
            print("char_element is None!")
            continue
        unicode_data = char_element.find("unicodedata")
        if unicode_data is None:
            continue
        mathclass = unicode_data.get("mathclass", default="none")
        if mathclass in ("none", "A", "G"):     # Alphabetic and Glyph classes
            continue
        # if unicode_data.get("category", default="none") != "Sm":
        #     continue
        ch = char_element.get("id")
        if ch is None:
            print('char_element.get("id") is None!')
            continue
        ch = convert_to_char(ch)
        if len(ch) > 1:
            continue
        all_unicode_math_chars.add(ch)
    print(f"#all_unicode_math_chars = {len(all_unicode_math_chars)}")

    mathcat_chars = get_unicode_yaml_chars("../Rules/Languages/en/unicode.yaml", True) \
        .union(get_unicode_yaml_chars("../Rules/Languages/en/unicode-full.yaml", True))
    print(f"#mathcat_chars = {len(mathcat_chars)}")

    missing_chars = all_unicode_math_chars.difference(mathcat_chars)
    print(f"#missing_chars = {len(missing_chars)}")
    with open("missing_chars.yaml", 'w', encoding='utf8') as out_stream:
        for ch in sorted(missing_chars):
            write_line(ch, '', '', False, out_stream)


# Uncomment the task to run.
# create_unicode_from_list_of_symbols_html("euro-symbols2.yaml")   # NOTE(review): no function of this name is defined in this file
# create_greek_letters("greek-letters.yaml")
# extract_latex(r"c:\dev\mathml-refresh\xml-entities\unicode.xml")
# create_ascii_math("ascii-math-unicode.yaml")
# print_lambda_list()
missing_unicode_chars()