""" Translate the English rule files (but not the unicode files) to the target language. This is done with the function build_all_translations(). The unicode files are not built here because they are large enough to seem to occasionally run into hiccups. See the end of this file how this is used (typically change 'language' and just run the file) """ import re import os import sys sys.stdout.reconfigure(encoding='utf-8') # Translate text in rules into the target language # The google translate is done via https://github.com/ffreemt/google-stranslate (pip install itranslate) # from itranslate import itranslate as translate # TRANSLATE_URL = "https://translate.google.us" # # The google translate is done via googletrans # Note: needed to use 'pip install googletrans==4.0.0-rc1' and there is some concern this package might go away from googletrans import Translator GoogleTranslate = Translator(service_urls=["translate.google.us"]) # Google allows up to 500K chars translation/month, so using a key likely would be free anyway # Unlike the unicode file, the rule files don't have a lot of text. # # To speed things up and avoid getting blocked, two passes are taken: # 1. For each file, we gather all the text into a list that has "phrase(..'xxx'...)". We prepend :: to the phrase string. # 2. Turn the list into a string with separators, translate it, and reconvert to a list # 3. Reread the file replacing translations (we know the line number) and writing it out PhraseToTranslate = re.compile(r'phrase\(([^)]+)\)') WordToTranslate = re.compile(r't: "([^"]+)"') # run over the file and figure out what words need to be translated def collect_phrases_to_translate(file_to_translate: str) -> (list[str], list[str]): with open(file_to_translate, 'r', encoding='utf8') as in_stream: phrases = [] words = [] for line in in_stream: phrase = PhraseToTranslate.search(line) if phrase: phrases.append(phrase.group(1)) word = WordToTranslate.search(line) if word: words.append(word.group(1)) print(f"#phrases={len(phrases)}, #words={len(words)}") return (phrases, words) # break up the words into chunks to make google translate happy (and to run faster) and return a dictionary of word: translation MAX_CHARS_IN_CHUNK = 4500 # 4500 sometimes failed (language code "no") # try to avoid google banning us TIMEOUT = 2 import time def translate_phrases(phrases_to_translate: list[str], lang) -> list[str]: if lang=='nb' or lang=='nn': lang = 'no' # google doesn't know those variants, but SRE uses them def do_translation_chunk(phrases: list[str]): if len(phrases) == 0: return phrases # file with no "phrase(...)" # translate doesn't handle a list properly -- use ".\n" to separate phrases phrases_string = ".\n".join(phrases) # print("***Phrases to translate: {}\n".format(phrases)) translated_phrases_str: str = GoogleTranslate.translate(phrases_string, src='en', dest=lang).text translated_phrases_str = translated_phrases_str.replace('。', '.') # happens for Chinese translated_phrases_str = translated_phrases_str.replace('"', "'").replace("“", "'").replace("”", "'") # google occasionally changes quotes translated_phrases_str = translated_phrases_str.replace("«", "'").replace("»", "'") # google occasionally changes quotes to this form translated_phrases_str = translated_phrases_str.replace("、", ",") # Chinese comma translated_phrases_str = translated_phrases_str.lower() translated_phrases_list = translated_phrases_str.split('.\n') if len(translated_phrases_list) != len(phrases): print("\n!!!Problem in translation: size of translations ({}) 
# run over the file and figure out which phrases and words need to be translated
def collect_phrases_to_translate(file_to_translate: str) -> tuple[list[str], list[str]]:
    with open(file_to_translate, 'r', encoding='utf8') as in_stream:
        phrases = []
        words = []
        for line in in_stream:
            phrase = PhraseToTranslate.search(line)
            if phrase:
                phrases.append(phrase.group(1))
            word = WordToTranslate.search(line)
            if word:
                words.append(word.group(1))
        print(f"#phrases={len(phrases)}, #words={len(words)}")
        return (phrases, words)


# break up the phrases into chunks to make google translate happy (and to run faster) and return a list of translations
MAX_CHARS_IN_CHUNK = 4500   # 4500 sometimes failed (language code "no")
# try to avoid google banning us
TIMEOUT = 2
import time


def translate_phrases(phrases_to_translate: list[str], lang) -> list[str]:
    if lang == 'nb' or lang == 'nn':
        lang = 'no'     # google doesn't know those variants, but SRE uses them

    def do_translation_chunk(phrases: list[str]):
        if len(phrases) == 0:
            return phrases      # file with no "phrase(...)"
        # translate doesn't handle a list properly -- use ".\n" to separate phrases
        phrases_string = ".\n".join(phrases)
        # print("***Phrases to translate: {}\n".format(phrases))
        translated_phrases_str: str = GoogleTranslate.translate(phrases_string, src='en', dest=lang).text
        translated_phrases_str = translated_phrases_str.replace('。', '.')    # happens for Chinese
        translated_phrases_str = translated_phrases_str.replace('"', "'").replace("“", "'").replace("”", "'")   # google occasionally changes quotes
        translated_phrases_str = translated_phrases_str.replace("«", "'").replace("»", "'")     # google occasionally changes quotes to this form
        translated_phrases_str = translated_phrases_str.replace("、", ",")    # Chinese comma
        translated_phrases_str = translated_phrases_str.lower()
        translated_phrases_list = translated_phrases_str.split('.\n')
        if len(translated_phrases_list) != len(phrases):
            print("\n!!!Problem in translation: size of translations ({}) differs from phrases to translate ({})\n".format(len(translated_phrases_list), len(phrases)))
            print("English phrases: {}\n".format(phrases))
            print("Truncated translated phrases: {}\n".format(translated_phrases_list))
            # The Finnish translation (at least) for some reason has a few failures where ".\n" is only "\n" (and translation failed)
            # We try a last attempt by deleting the '.' and splitting at the newline
            print("Retrying by assuming '.' is missing...")
            translated_phrases_list = translated_phrases_str.replace('.', '').split('\n')
            if len(translated_phrases_list) != len(phrases):
                print("!!!***Retry failed: size of translations ({}) differs from phrases to translate ({})\n".format(len(translated_phrases_list), len(phrases)))
                print("Phrases to translate:\n{}".format(list(phrases)))
                print("Translations:\n{}".format(list(translated_phrases_list)))
        return translated_phrases_list

    translations = []
    char_count = 0
    phrases_chunks_to_translate = []
    for phrase in phrases_to_translate:
        phrases_chunks_to_translate.append(phrase)
        char_count += len(phrase)
        if char_count >= MAX_CHARS_IN_CHUNK:
            print("char_count={}".format(char_count))
            translations += do_translation_chunk(phrases_chunks_to_translate)
            print("Translated {} phrases...".format(len(phrases_chunks_to_translate)))
            char_count = 0
            phrases_chunks_to_translate = []
            time.sleep(TIMEOUT)     # try to avoid google banning us
    return translations + do_translation_chunk(phrases_chunks_to_translate)


TargetWord = re.compile(r"'([^']+)'")
TextString = re.compile(r'([ \[{][oc]?t: )"([^"]+)"')


def substitute_in_translated_phrase(line, translated_phrase, translated_word) -> str:
    has_phrase = PhraseToTranslate.search(line)
    target_words = TargetWord.search(translated_phrase)
    text_words = TextString.search(line)
    new_line = line
    if has_phrase and target_words and text_words:      # test for text_words handles "variables: [....]"
        try:
            replacement = text_words.group(1) + '"' + target_words.group(1) + '"'   # add the surrounding context back
        except AttributeError:
            print(f"text_words={text_words}, target_words={target_words}, line='{line}'")
            exit()
        new_line = TextString.sub(replacement, line)
        # print("fixed line: {}".format(new_line))
    elif text_words:
        print(f"Failed to find quoted part in translation \"{translated_phrase}\", using '{translated_word}'\n  original line: {line}")
        replacement = text_words.group(1) + '"' + translated_word + '"'     # add the surrounding context back
        new_line = TextString.sub(replacement, line)
    return new_line
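# A small self-contained check of the substitution above. Both the rule line and the
# "translation" are invented for illustration (no call to Google is made here): the
# single-quoted word inside the translated phrase replaces the double-quoted English
# text, while the phrase(...) comment itself is left untouched.
_demo_line = '      - t: "cap"\t\t# phrase(\'cap\' as in capital letter)\n'
_demo_translation = "'majuscule' comme dans lettre majuscule"
assert substitute_in_translated_phrase(_demo_line, _demo_translation, "majuscule") == \
    '      - t: "majuscule"\t\t# phrase(\'cap\' as in capital letter)\n'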
def create_new_file(file_to_translate: str, output_file: str, phrase_translations: list[str], word_translations: list[str]) -> None:
    with open(output_file, 'w', encoding='utf8') as out_stream:
        with open(file_to_translate, 'r', encoding='utf8') as in_stream:
            iPhraseTranslation = 0
            iWordTranslation = 0
            # need to add an extra element to both lists because the indexes are inc'd after last entry but could be more non-translation lines
            phrase_translations.append("dummy")
            word_translations.append("dummy")
            for line in in_stream:
                out_stream.write(substitute_in_translated_phrase(
                    line, phrase_translations[iPhraseTranslation], word_translations[iWordTranslation]))
                if PhraseToTranslate.search(line):
                    iPhraseTranslation += 1
                if WordToTranslate.search(line):
                    iWordTranslation += 1


def build_new_translation(path_to_mathcat: str, lang: str, rule_file_name: str) -> None:
    print("build_new_translation: rule_file_name=", rule_file_name)
    file_to_translate = "{}/Rules/Languages/en/{}".format(path_to_mathcat, rule_file_name)
    (phrases_to_translate, words_to_translate) = collect_phrases_to_translate(file_to_translate)
    phrase_translations = translate_phrases(phrases_to_translate, lang)
    word_translations = translate_phrases(words_to_translate, lang)
    print(f"file:{rule_file_name}: #phrases={len(phrase_translations)}, #words={len(word_translations)}")
    create_new_file(file_to_translate, os.path.join(lang, rule_file_name), phrase_translations, word_translations)
    print("done\n")


def build_all_translations(path_to_mathcat: str, lang: str, subdir="") -> None:
    dir_to_translate = os.path.join(path_to_mathcat, "Rules", "Languages", "en", subdir)
    entries = os.listdir(dir_to_translate)
    for entry in entries:
        if os.path.isdir(os.path.join(dir_to_translate, entry)):
            build_all_translations(path_to_mathcat, lang, os.path.join(subdir, entry))
        elif entry.endswith('.yaml') and not(entry == "definitions.yaml" or entry == "unicode.yaml" or entry == "unicode-full.yaml"):
            # the excluded files are built in translate-unicode.py and need some manual checking so not included here
            build_new_translation(path_to_mathcat, lang, os.path.join(subdir, entry))


language = 'el'
if not os.path.exists(language):
    os.makedirs(language)
if not os.path.exists(language + "/SharedRules"):
    os.makedirs(language + "/SharedRules")

# build_new_translation("..", language, "ClearSpeak_Rules.yaml")
build_all_translations("..", language)
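# Optional follow-up added here as a convenience sketch (not part of the original workflow):
# list every file that now exists under the output directory, so it is easy to eyeball that
# each English rule file got a translated counterpart.
for _root, _dirs, _files in os.walk(language):
    for _name in sorted(_files):
        print("generated:", os.path.join(_root, _name))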