# Translate unicode characters into the target language
# This makes use of three sources: SRE's translations, MathPlayer's translations, and Google translate.
import sys
sys.stdout.reconfigure(encoding='utf-8')

# The google translation used to be done via https://github.com/ffreemt/google-stranslate (pip install itranslate)
# from itranslate import itranslate as translate
# TRANSLATE_URL = "https://translate.google.us"
# Old -- this fails due to gender issues (e.g., try translating "the" to Spanish). People mentioned fixes, but the project seems dead.

# The google translation is done via googletrans
# Note: needed to use 'pip install googletrans==4.0.0-rc1', and there is some concern this package might go away
from googletrans import Translator
GoogleTranslate = Translator(service_urls=["translate.google.us"])

# Google allows up to 500K chars of translation/month, so using a key likely would be free anyway.
# Got blocked trying to translate unicode-full.yaml (~8,100 text strings).
# Some sites said ~450 requests in a short time will cause a block.
# Some other places said there is a ~5,000 char limit on a bulk request.
# There are ~15 chars/request on average.
# To speed things up and avoid getting blocked, the work is done in three steps:
# 1. Go through the document, find all strings to translate, and add them to a dictionary.
# 2. Convert the dictionary to a list, break it into appropriate chunks, and translate each chunk, building up a new dictionary.
# 3. Process the file and decide on using MP, SRE, or Google for the text; add comments for the others.
# This results in unneeded translations because if MP and SRE agree, we shouldn't bother with the translation.
# However, knowing what MP and SRE will use for translations requires knowing the 'ch', and that's more work.
# The solution I've adopted is a bit ugly in that the two passes over the file (steps 1 and 3) duplicate some logic.


def translate_char(ch: str, ignore_ch: bool, en_text: str, mathplayer: dict, sre: dict, access8: dict, google: dict):
    mp_trans = mathplayer[ch] if ch in mathplayer else ''
    sre_trans = sre[ch] if ch in sre else ''
    access8_trans = access8[ch] if ch in access8 else ''    # don't bother doing the translation if mp and sre agree
    google_trans = ''
    # print(f"mp_trans/sre_trans/access8_trans: '{mp_trans}/{sre_trans}/{access8_trans}'")
    if ignore_ch or mp_trans != sre_trans or mp_trans == '':
        en_text = en_text.replace("eigh", "a").replace("Eigh", "A").replace("cap", "uppercase").replace("paren", "parenthesis")
        if ignore_ch:
            mp_trans = ''
            sre_trans = ''
            access8_trans = ''
        if len(en_text) > 1:
            google_trans = google[en_text]
            # print("Google Translation:('{}') = '{}'".format(en_text, google_trans))
        else:
            google_trans = ch
    return (mp_trans, sre_trans, access8_trans, google_trans)
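
# Illustrative sketch (not called anywhere): shows how translate_char picks among the sources,
# using made-up dictionaries. The character and translations below are hypothetical examples,
# not real MathPlayer/SRE/Access8 data.
def _example_translate_char_precedence():
    mathplayer = {'∀': 'para todo'}
    sre = {'∀': 'para cada'}            # disagrees with MathPlayer, so Google is consulted
    access8 = {}
    google = {'for all': 'para todos'}  # keyed by the (tweaked) English text, not the char
    return translate_char('∀', False, 'for all', mathplayer, sre, access8, google)
    # -> ('para todo', 'para cada', '', 'para todos')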

import re
TextToTranslate = re.compile('t: "([^"]+)"')


def translate_char_line(ch: str, line: str, mathplayer: dict, sre: dict, access8: dict, google: dict):
    # using a closure for this is ugly
    result = {}

    def do_translate_char(match_obj):
        if match_obj:
            alternatives = []
            ignore_ch = line.find('then:') >= 0     # usually an alternative to what mp and sre would say
            mp_trans, sre_trans, access8_trans, google_trans = translate_char(ch, ignore_ch, match_obj.group(1), mathplayer, sre, access8, google)
            # print("ch='{}', mp/sre/access8/google='{}'/'{}'/'{}'/'{}'\n".format(ch, mp_trans, sre_trans, access8_trans, google_trans))
            if mp_trans == '' and sre_trans == '' and access8_trans == '' and google_trans == '':
                translation = match_obj.group(1)    # found nothing (can this happen?)
            elif mp_trans == '' and sre_trans == '' and access8_trans == '':
                translation = google_trans
                alternatives.append("google translation")
            elif mp_trans == sre_trans and mp_trans != '':
                translation = mp_trans              # skip the google translation because mp and sre agree
            elif google_trans == mp_trans and google_trans != '':
                translation = mp_trans
                if sre_trans:
                    alternatives.append("SRE: '{}'".format(sre_trans))
            elif google_trans == sre_trans and google_trans != '':
                translation = sre_trans
                if mp_trans:
                    alternatives.append("MathPlayer: '{}'".format(mp_trans))
            # at this point we know none of the options match and that at least one of mp or sre is non-empty
            elif sre_trans:
                translation = sre_trans
                if mp_trans:
                    alternatives.append("MathPlayer: '{}'".format(mp_trans))
                alternatives.append("google: '{}'".format(google_trans))
            elif mp_trans:
                translation = mp_trans
                if sre_trans:
                    alternatives.append("SRE: '{}'".format(sre_trans))
                alternatives.append("google: '{}'".format(google_trans))
            elif access8_trans:
                translation = access8_trans
            else:
                translation = google_trans          # only translation comes from google
            result['original'] = match_obj.group(1)
            result['translation'] = translation
            result['alternatives'] = alternatives
            if line.find('divided by') != -1:
                print(f"  divided by translation: {translation}")
            return 't: "{}"'.format(translation)
        else:
            return line

    return (line if line.lstrip().startswith('#') else TextToTranslate.sub(do_translate_char, line), result)


# char defs take one of two forms:
# single line:      - "̇": [t: "dot above embellishment"]    # 0x307
# multiple lines:
#  - "Α":                                       # 0x391
#     - test:
#         if: "($Blind or $Verbosity!='Terse')"
#         then: [t: "cap"]
#     - t: alpha
CharDefStart = re.compile('[^-]*- "([^"])"')    # we skip ranges such as A-Z


# gather lines until the next char def is found
def get_next_char_def(lines: list):
    iStart = 1
    while iStart < len(lines):
        if CharDefStart.match(lines[iStart]):
            return lines[:iStart]
        iStart += 1
    return lines    # last char definition


def gather_words_in_char_def(lines: list, lang: str, mathplayer: dict, sre: dict, access8: dict, words_to_translate: set):
    def gather_words_for_text(ch: str, en_text: str, lang: str, mathplayer: dict, sre: dict, access8: dict, words_to_translate: set):
        mp_trans = mathplayer[ch] if ch in mathplayer else ''
        sre_trans = sre[ch] if ch in sre else ''
        access8_trans = access8[ch] if ch in access8 else ''    # don't bother doing the translation if mp and sre agree
        google_trans = ''
        # print("mp_trans/sre_trans: '{}/{}'".format(mp_trans, sre_trans))
        if mp_trans != sre_trans or mp_trans == '':     # note: ch == '' => mp_trans == ''
            en_text = en_text.replace("eigh", "a").replace("Eigh", "A").replace("cap", "uppercase").replace("paren", "parenthesis")
            if len(en_text) > 1:
                words_to_translate.add(en_text)

    ch_match = CharDefStart.match(lines[0])
    ch = ch_match.group(1) if ch_match else ''
    for line in lines:
        en_text = TextToTranslate.search(line)      # assumes only one "t:" per line
        if en_text:
            # if "then:" is present, it is usually an alternative to what mp and sre would say
            ch_for_line = '' if line.find('then:') >= 0 else ch
            gather_words_for_text(ch_for_line, en_text.group(1), lang, mathplayer, sre, access8, words_to_translate)
    return words_to_translate
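
# Illustrative sketch (not called anywhere): shows what CharDefStart and TextToTranslate match
# on the single-line char-def form documented above. The YAML lines are a hypothetical slice of
# a unicode.yaml file, based on the examples in the comment.
def _example_char_def_parsing():
    lines = [
        ' - "\u0307": [t: "dot above embellishment"]    # 0x307\n',
        ' - "Α":                                        # 0x391\n',
        '    - test:\n',
        '        if: "($Blind or $Verbosity!=\'Terse\')"\n',
        '        then: [t: "cap"]\n',
        '    - t: "alpha"\n',
    ]
    first_def = get_next_char_def(lines)                    # everything before the "Α" char def
    ch = CharDefStart.match(first_def[0]).group(1)          # U+0307 (combining dot above)
    texts = [TextToTranslate.search(l).group(1) for l in first_def if TextToTranslate.search(l)]
    return ch, texts                                        # ('\u0307', ['dot above embellishment'])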

# echo lines, substituting for any "t:"
def process_char_def(lines: list, mathplayer: dict, sre: dict, access8: dict, google: dict, out_stream):
    match = CharDefStart.match(lines[0])
    ch = match.group(1) if match else ''
    for line in lines:
        translated_line, details = translate_char_line(ch, line, mathplayer, sre, access8, google)
        if translated_line:
            # make comments that don't start a line mostly align
            i_comment_char = translated_line.find('#')
            if i_comment_char > 0 and translated_line.find('"#"') >= 0:
                # avoid considering '#' in the case of it being defined:  - "#": [t: "number"]
                i_comment_char = translated_line.find('#', i_comment_char + 1)
            comment = ''
            if i_comment_char > 0 and not(translated_line.lstrip().startswith('#')):
                comment = translated_line[i_comment_char + 1:].rstrip()
                translated_line = translated_line[:i_comment_char - 1]
            if 'alternatives' in details:
                alternatives = details['alternatives']
                if details['original'] != details['translation']:
                    alternatives.insert(0, "en: '{}'".format(details['original']))
                if alternatives != []:
                    comment += '\t(' + alternatives[0]
                    for alt in alternatives[1:]:
                        comment += ", " + alt
                    comment += ')'
            if comment:
                translated_line = "{:<48s}\t# {}\n".format(translated_line.rstrip(), comment)
                # print("***{}\t# {}\n".format(translated_line.rstrip(), comment))
        out_stream.write(translated_line if ch else line)


# run over the file and figure out what words need to be translated
def collect_words_to_translate(file_to_translate: str, lang: str, mathplayer: dict, access8: dict, sre: dict):
    with open(file_to_translate, 'r', encoding='utf8') as in_stream:
        lines = in_stream.readlines()
        iLine = 0
        words_to_translate = set()
        while iLine < len(lines):
            char_def_lines = get_next_char_def(lines[iLine:])
            # print("\niLines={}\n{}".format(iLine, list(map(lambda l: l+"\n", char_def_lines))))
            if len(char_def_lines) == 0:
                break
            gather_words_in_char_def(char_def_lines, lang, mathplayer, sre, access8, words_to_translate)
            iLine += len(char_def_lines)
        return words_to_translate


# break up the words into chunks to make google translate happy (and to run faster) and return a dictionary of word: translation
MAX_CHARS_IN_CHUNK = 4500   # 4500 sometimes failed (language code "no")
# try to avoid google banning us
TIMEOUT = 2

import time

def translate_words(words_to_translate: set[str], lang):
    if lang == 'nb' or lang == 'nn':
        lang = 'no'     # google doesn't know those variants, but SRE uses them
    translations = {}

    def do_translation_chunk(words: list[str]):
        # translate doesn't handle a list properly -- use ".\n" to separate words
        word_string = ".\n".join(list(words))
        # chunk_translations = translate(words, from_lang='en', to_lang=lang, url=TRANSLATE_URL)
        translated_words_str: str = GoogleTranslate.translate(word_string, src='en', dest=lang).text.lower()
        # Chinese has "." translated to "。"
        translated_words_str = translated_words_str.replace('。', '.')
        translated_words = translated_words_str.split('.\n')
        if len(translated_words) != len(words):
            print("\n!!!Problem in translation: size of translations ({}) differs from words to translate ({})\n"
                  .format(len(translated_words), len(words)))
            # The Finnish translation (at least) for some reason has a few failures where ".\n" comes back as just "\n" (and the translation failed).
            # We make a last attempt by deleting the '.' and splitting at the newline.
            print("Retrying by assuming '.' is missing...")
            translated_words = translated_words_str.replace('.', '').split('\n')
            if len(translated_words) != len(words):
                print("!!!Retry failed: size of translations ({}) differs from words to translate ({})\n"
                      .format(len(translated_words), len(words)))
                print("Words to translate:\n{}".format(list(words)))
                print("Translations:\n{}".format(list(translated_words)))
        for (orig, translation) in zip(words, translated_words):
            translations[orig] = translation

    word_list = list(words_to_translate)
    word_list.sort()    # make deterministic for debugging
    char_count = 0
    chunk = []
    for word in word_list:
        chunk.append(word)
        char_count += len(word)
        if char_count >= MAX_CHARS_IN_CHUNK:
            do_translation_chunk(chunk)
            print("Translated {} words...".format(len(chunk)))
            char_count = 0
            chunk = []
            time.sleep(TIMEOUT)     # try to avoid google banning us
    do_translation_chunk(chunk)
    return translations
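
# Illustrative sketch (not called anywhere): reproduces the chunk-splitting arithmetic used by
# translate_words without making any network calls. The tiny max value and word list are
# hypothetical, chosen only to show where chunk boundaries land.
def _example_chunking(max_chars: int = 12):
    word_list = sorted({"alpha", "beta", "dot above", "for all"})
    chunks = []
    chunk, char_count = [], 0
    for word in word_list:
        chunk.append(word)
        char_count += len(word)
        if char_count >= max_chars:     # same test translate_words uses with MAX_CHARS_IN_CHUNK
            chunks.append(chunk)
            chunk, char_count = [], 0
    if chunk:
        chunks.append(chunk)
    return chunks   # [['alpha', 'beta', 'dot above'], ['for all']]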

def create_new_file(file_to_translate: str, output_file: str, mathplayer: dict, sre: dict, access8: dict, google: dict):
    with open(file_to_translate, 'r', encoding='utf8') as in_stream:
        with open(output_file, 'w', encoding='utf8') as out_stream:
            lines = in_stream.readlines()
            iLine = 0
            while iLine < len(lines):
                char_def_lines = get_next_char_def(lines[iLine:])
                # print("\niLines={}\n{}".format(iLine, list(map(lambda l: l+"\n", char_def_lines))))
                if len(char_def_lines) == 0:
                    break
                process_char_def(char_def_lines, mathplayer, sre, access8, google, out_stream)
                iLine += len(char_def_lines)


def build_new_translation(path_to_mathcat: str, lang: str, unicode_file_name: str):
    sre = get_sre_unicode_dict(SRE_Location, lang)
    mathplayer = get_mathplayer_unicode_dict(MP_Location, lang)
    access8 = get_access8_unicode_dict(ACCESS8_Location, lang)
    file_lang_to_translate = lang if lang == 'vi' or lang == 'id' else 'en'     # these are already partially translated
    file_to_translate = "{}/Rules/Languages/{}/{}.yaml".format(path_to_mathcat, file_lang_to_translate, unicode_file_name)
    words_to_translate = collect_words_to_translate(file_to_translate, lang, mathplayer, access8, sre)
    google = translate_words(words_to_translate, lang)
    print("Translations: MathPlayer={}, SRE={}, Access8={}, Google={}".format(len(mathplayer), len(sre), len(access8), len(google)))
    create_new_file(file_to_translate, "{}/{}.yaml".format(lang, unicode_file_name), mathplayer, sre, access8, google)


import os
import json

def get_sre_unicode_dict(path: str, lang: str):
    try:
        dict = {}
        full_path = path + "\\" + lang + "\\" + "symbols" + "\\"
        for filename in os.listdir(full_path):
            with open(full_path + filename, 'r', encoding='utf8') as in_stream:
                # print("\nReading file {}".format(path+filename))
                sre_data = json.load(in_stream)
                for sre_entry in sre_data:
                    # entries we care about look like {"key": "2212", "mappings": {"default": {"default": "menos"}}}
                    if "key" in sre_entry and "default" in sre_entry["mappings"]:
                        key = chr(int(sre_entry["key"], base=16))
                        dict[key] = sre_entry["mappings"]["default"]["default"]
        return dict
    except OSError:
        print(f"SRE not present: lang={lang}")
        lang_parts = lang.split('-')
        return {} if len(lang_parts) == 1 else get_sre_unicode_dict(path, lang_parts[0])


# entries we care about look like    char ? (unicode == 0x2212) => string{text="menos";};
# or                                 char ? (unicode == 0x004E) => string{text= "n"+(::target_group!="Blind" ? "" : " majuscule");};;
MP_Pattern = re.compile(r'.*?\(unicode == 0x([0-9A-Fa-f]{4,5})\).*?"([^"]+)".*?')

def get_mathplayer_unicode_dict(path: str, lang: str):
    full_path = path + "\\" + lang + "\\"
    print(f"MathPlayer path='{full_path}'")
    try:
        dict = {}
        with open(full_path + "unicode.tdl", 'r', encoding='utf8') as in_stream:
            lines = in_stream.readlines()
            print(f"  #lines={len(lines)}")
            for line in lines:
                matches = MP_Pattern.match(line)
                if matches:
                    int_key = int(matches.group(1), base=16)
                    text = matches.group(2).strip()
                    # MP makes use of chars in the private use area (E000—F8FF) -- don't add those.
                    # Also, there's a lot of stuff in the 'zh' translation that isn't Chinese, so skip that.
                    if (int_key < 0xE000 or int_key > 0xF8FF) and text and not(lang.startswith('zh') and text.isascii()):
                        key = chr(int_key)
                        dict[key] = text
            print(f"dict entries = {len(dict)}")
            return dict
    except OSError:
        print(f"MathPlayer not found: lang={lang}")
        lang_parts = lang.split('-')
        return {} if len(lang_parts) == 1 else get_mathplayer_unicode_dict(path, lang_parts[0])


# entries we care about look like "∀\tfor all", where we need to make sure the first entry is a single char
Access8_Pattern = re.compile(r'^(.)\t(.+)$')

def get_access8_unicode_dict(path: str, lang: str):
    full_path = path + "\\" + lang.replace('-', '_') + "\\"
    print(f"Access8Math path='{full_path}'")
    try:
        dict = {}
        with open(full_path + "unicode.dic", 'r', encoding='utf8') as in_stream:
            lines = in_stream.readlines()
            print(f"  #lines={len(lines)}")
            for line in lines:
                matches = Access8_Pattern.match(line)
                if matches:
                    key = matches.group(1)
                    text = matches.group(2).strip()
                    dict[key] = text
            print(f"dict entries = {len(dict)}")
            return dict
    except OSError:
        print(f"Access8 not found: lang={lang}")
        lang_parts = lang.split('-')
        return {} if len(lang_parts) == 1 else get_access8_unicode_dict(path, lang_parts[0])


# for some diagnostics (from stackoverflow.com)
def dict_compare(lang: str, sre: dict, mp: dict):
    sre_keys = set(sre.keys())
    mp_keys = set(mp.keys())
    shared_keys = sre_keys.intersection(mp_keys)
    sre_only = sre_keys - mp_keys
    mp_only = mp_keys - sre_keys
    differ = {o: (sre[o], mp[o]) for o in shared_keys if sre[o] != mp[o]}
    same = set(o for o in shared_keys if sre[o] == mp[o])
    with open("diffs-{}.txt".format(lang), 'w', encoding='utf8') as out_stream:
        def print_dict(name, dict):
            out_stream.write("\n\n---{}---\n".format(name))
            for key in dict:
                out_stream.write("  {}({:0>4x})={}\n".format(key, ord(key), dict[key]))

        def print_set(name, set, orig_dict):
            out_stream.write("\n---{}---\n".format(name))
            for key in set:
                out_stream.write("  {}({:0>4x})='{}'\n".format(key, ord(key), orig_dict[key]))

        out_stream.write("sre/mp #chars={}/{}, #same={}, #differ={}, #only sre/mp={}/{}"
                         .format(len(sre), len(mp), len(same), len(differ), len(sre_only), len(mp_only)))
        print_dict("differ", differ)
        print_set("sre_only", sre_only, sre)
        print_set("mp_only", mp_only, mp)
    return (sre_only, mp_only, differ, same)

# Google translate fails on lots of words when I use GoogleTranslate.translate() on a single def.
# I played with changing the quotes and a few other things. The failures shifted, but the results weren't good.
# Strangely, if you copy the '.../en/definitions.yaml' file and paste it into translate.google.com, it does it all.
# Rather than waste a bunch more time on this, the code assumes you've translated the file already and stored it in 'google-defs.yaml' in the current dir.
# It then goes through the English version, keeping the English structure and pulling only the translated *values*
# out of 'google-defs.yaml', writing '[lang]/definitions.yaml'.
def translate_definitions(path_to_mathcat: str, lang: str):
    if lang == 'nb' or lang == 'nn':
        lang = 'no'     # google doesn't know those variants
    file_to_translate = "{}/Rules/Languages/en/definitions.yaml".format(path_to_mathcat)
    with open("google-defs.yaml", 'r', encoding='utf8') as google_stream:
        translated_lines = google_stream.readlines()
    with open(file_to_translate, 'r', encoding='utf8') as in_stream:
        with open(f"{lang}/definitions.yaml", 'w', encoding='utf8') as out_stream:
            lines = in_stream.readlines()
            i = 0
            n_lines = len(lines)
            while i < n_lines:
                if not(lines[i].startswith('#')) and lines[i].find(': [') >= 0:
                    # handles 'xxx: [' up to and including the line with the matching ']'
                    i = translate_definition(i, lines, translated_lines, out_stream)
                else:
                    out_stream.write(lines[i])
                i += 1


def translate_definition(start: int, lines: list[str], translated_lines: list[str], out_stream):
    out_stream.write(lines[start])
    i = start + 1       # first line is 'name: ['
    while i < len(lines):
        if lines[i].find(']') >= 0:
            out_stream.write(lines[i])
            return i
        out_stream.write(translated_lines[i].replace("“", "'").replace("”", "'").replace("、", ","))    # Chinese punctuation
        i += 1
    return i


def build_euro(lang: str):
    sre = get_sre_euro_dict()
    list(sre).sort()
    print(f"Translations: SRE={len(sre)}")
    with open("latex-braille-unicode.yaml", 'w', encoding='utf8') as out_stream:
        out_stream.write("---\n")
        for ch, braille in sre.items():
            if ch == '"':
                ch = '\\"'
            elif ch == '\\':
                ch = '\\\\'
            elif ch == '\\127':
                ch = '\\x7F'
            first_part = f' - "{ch}": [t: "{braille}"]'
            try:
                comment = ''
                if ch == '\\\\' or ch == '\\"':
                    comment = hex(ord(ch[1]))
                elif len(ch) == 1 or len(ch) == 2:
                    comment = hex(ord(ch))
                else:
                    comment = "0" + ch[1:]
                out_stream.write('{:32}# {}\n'.format(first_part, comment))
            except:
                print(f"failed to write a line for ch={ch}")


def get_sre_euro_dict():
    dict = {}
    full_path = SRE_Location + "\\" + "euro" + "\\" + "characters" + "\\"
    for filename in os.listdir(full_path):
        if filename == "Braille.json":
            continue
        with open(full_path + filename, 'r', encoding='utf8') as in_stream:
            print("\nReading file {}".format(full_path + filename))
            sre_data = json.load(in_stream)
            sre_data = sre_data[2]
            dict.update(sre_data)
    return dict


def write_euro_braille_file():
    file = open("EuroBraille-dict.txt", 'w', encoding='utf8')
    file.write('{\n')
    for key, value in get_sre_euro_dict().items():
        if key == '"':
            key = '\\"'
        elif key == '\\':
            key = '\\\\'
        file.write('  "%s": "%s",\n' % (key, value))
    file.write('}\n')
    file.close()
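
# Illustrative sketch (not called anywhere): shows the shape of one line that build_euro writes
# to latex-braille-unicode.yaml, using a made-up character/braille pair rather than real SRE
# euro-braille data.
def _example_euro_line():
    ch, braille = 'a', '⠁'      # hypothetical entry
    first_part = f' - "{ch}": [t: "{braille}"]'
    return '{:32}# {}\n'.format(first_part, hex(ord(ch)))   # ' - "a": [t: "⠁"]' padded to 32 chars, then '# 0x61'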

# if os.path.exists("unicode.yaml"):
#     os.remove("unicode.yaml")

SRE_Location = r"C:\Dev\speech-rule-engine\mathmaps"
MP_Location = r"C:\Dev\mathplayer\EqnLib\rules\pvt"
ACCESS8_Location = r"C:\dev\Access8Math\addon\globalPlugins\Access8Math\locale\speech"

# (sre_only, mp_only, differ, same) = dict_compare("es", sre_chars, mp_chars)
# (sre_only, mp_only, differ, same) = dict_compare("fr", get_sre_unicode_dict(SRE_Location, "fr"), get_mathplayer_unicode_dict(MP_Location, "fr"))
# (sre_only, mp_only, differ, same) = dict_compare("it", get_sre_unicode_dict(SRE_Location, "it"), get_mathplayer_unicode_dict(MP_Location, "it"))

language = "zh-cn"
# build_new_translation("..", language, "unicode")
# build_new_translation("..", language, "unicode-full")

# see the translate_definitions comments -- you need to manually copy the file to google translate first.
# translate_definitions("..", language)

build_euro("euro")