#!/usr/bin/python # -*- coding: utf-8 -*- # vim: ts=4 ### # # Copyright (c) 2018-2019 Andalugeeks # Authors: # - Ksar Feui # - J. Félix Ontañón import re import random from exceptions import Exception from andaluh.defs import * # Regex compilation. # Words to ignore in the translitaration in escapeLinks mode. to_ignore_re = re.compile('|'.join([ r'(?:https?://)?(?:www\.)?(?:[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6})', # URLs, i.e. andaluh.es, www.andaluh.es, https://www.andaluh.es r'(?:@\w+\b)', # Mentions, i.e. @andaluh r'(?:#\w+\b)' # Hashtags, i.e. #andaluh ]), re.IGNORECASE|re.UNICODE) # Auxiliary functions def get_vowel_circumflex(vowel): # If no tilde, replace with circumflex if vowel and vowel in VOWELS_ALL_NOTILDE: i = VOWELS_ALL_NOTILDE.find(vowel) + 5 return VOWELS_ALL_NOTILDE[i: i+1][0] # If vowel with tilde, leave it as it is elif vowel and vowel in VOWELS_ALL_TILDE: return vowel # You shouldn't call this method with a non vowel else: raise AndaluhError('Not a vowel', vowel) def get_vowel_tilde(vowel): # If no tilde, replace with circumflex if vowel and vowel in VOWELS_ALL_NOTILDE: i = VOWELS_ALL_NOTILDE.find(vowel) return VOWELS_ALL_TILDE[i] # If vowel with tilde, leave it as it is elif vowel and vowel in VOWELS_ALL_TILDE: return vowel # You shouldn't call this method with a non vowel else: raise AndaluhError('Not a vowel', vowel) #TODO: This can be improved to perform replacement in a per character basis #NOTE: It assumes replacement_word to be already lowercase def keep_case(word, replacement_word): if word.islower(): return replacement_word elif word.isupper(): return replacement_word.upper() elif word.istitle(): return replacement_word.title() else: return replacement_word # EPA replacement functions def h_rules(text): """Supress mute /h/""" def replace_with_case(match): word = match.group(0) if word.lower() in H_RULES_EXCEPT.keys(): return keep_case(word, H_RULES_EXCEPT[word.lower()]) else: def replace_with_case(match): h_char = match.group(1) next_char = match.group(2) if next_char and h_char.isupper(): return next_char.upper() elif next_char and h_char.islower(): return next_char.lower() else: return '' return re.sub(ur'(? chiguagua text = re.sub(ur'(? cacagûete text = re.sub(ur'(? Çilófono roto if text[0] == "X": text = vaf.upper() + text[1:] if text[0] == "x": text = vaf + text[1:] # If the /ks/ sound is between vowels # Axila => Aççila | Éxito => Éççito text = re.sub(ur'(a|e|i|o|u|á|é|í|ó|ú)(x)(a|e|i|o|u|á|é|í|ó|ú)', replace_intervowel_with_case, text, flags=re.IGNORECASE|re.UNICODE) # Every word starting with /ks/ text = re.sub(ur'\b(x)', replace_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text def ch_rules(text): """Replacement rules for /∫/ (voiceless postalveolar fricative)""" text = re.sub(ur'(c)(h)', lambda match: u'x' if match.group(1).islower() else u'X', text, flags=re.IGNORECASE) return text def gj_rules(text, vvf=VVF): """Replacing /x/ (voiceless postalveolar fricative) with /h/""" def replace_h_with_case(match): word = match.group(0) if word.lower() in GJ_RULES_EXCEPT.keys(): return keep_case(word, GJ_RULES_EXCEPT[word.lower()]) else: #TODO: This is an AWFUL way of implementing replacement rules with exceptions. To be fixed. word = re.sub(ur'(g|j)(e|i|é|í)', lambda match: vvf + match.group(2) if match.group(1).islower() else vvf.upper() + match.group(2), word, flags=re.IGNORECASE|re.UNICODE) word = re.sub(ur'(j)(a|o|u|á|ó|ú)', lambda match: vvf + match.group(2) if match.group(1).islower() else vvf.upper() + match.group(2), word, flags=re.IGNORECASE|re.UNICODE) return word def replace_g_with_case(match): s = match.group('s') a = match.group('a') b = match.group('b') ue = match.group('ue') const = match.group('const') return s + a + keep_case(b, 'g') + ue + const text = re.sub(ur'\b(\w*?)(g|j)(e|i|é|í)(\w*?)\b', replace_h_with_case, text, flags=re.IGNORECASE|re.UNICODE) text = re.sub(ur'\b(\w*?)(j)(a|o|u|á|ó|ú)(\w*?)\b', replace_h_with_case, text, flags=re.IGNORECASE|re.UNICODE) # GUE,GUI replacement text = re.sub(ur'(gu|gU)(e|i|é|í|E|I|É|Í)', ur'g\2', text) text = re.sub(ur'(Gu|GU)(e|i|é|í|E|I|É|Í)', ur'G\2', text) # GÜE,GÜI replacement text = re.sub(ur'(g|G)(ü)(e|i|é|í|E|I|É|Í)', ur'\1u\3', text) text = re.sub(ur'(g|G)(Ü)(e|i|é|í|E|I|É|Í)', ur'\1U\3', text) # buen / abuel / sabues => guen / aguel / sagues # TODO: I've the gut feeling the following two regex can be merged into one. text = re.sub(ur'(b)(uen)', lambda match: u'g' + match.group(2) if match.group(1).islower() else u'G' + match.group(2), text, flags=re.IGNORECASE|re.UNICODE) text = re.sub(ur'(?Ps?)(?Pa?)(?b)(?Pue)(?Pl|s)', replace_g_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text def v_rules(text): """Replacing all /v/ (Voiced labiodental fricative) with /b/""" def replace_with_case(match): word = match.group(0) if word.lower() in V_RULES_EXCEPT.keys(): return keep_case(word, V_RULES_EXCEPT[word.lower()]) else: # NV -> NB -> MB (i.e.: envidia -> embidia) word = re.sub(ur'nv', lambda match: keep_case(match.group(0), 'mb'), word, flags=re.IGNORECASE|re.UNICODE) word = re.sub(ur'v', ur'b', word) word = re.sub(ur'V', ur'B', word) return word text = re.sub(ur'\b(\w*?)(v)(\w*?)\b', replace_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text def ll_rules(text): """Replacing /ʎ/ (digraph ll) with Greek Y for /ʤ/ sound (voiced postalveolar affricate)""" def replace_with_case(match): word = match.group(0) if word.lower() in LL_RULES_EXCEPT.keys(): return keep_case(word, LL_RULES_EXCEPT[word.lower()]) else: return re.sub(ur'(l)(l)', lambda match: 'Y' if match.group(1).isupper() else 'y', word, flags=re.IGNORECASE) text = re.sub(ur'\b(\w*?)(l)(l)(\w*?)\b', replace_with_case, text, flags=re.IGNORECASE) return text def l_rules(text): """Rotating /l/ with /r/""" text = re.sub( ur'(l)(b|c|ç|Ç|g|s|d|f|g|h|k|m|p|q|r|t|x|z)', lambda match: 'r' + match.group(2) if match.group(1).islower() else 'R' + match.group(2), text, flags=re.IGNORECASE ) return text def psico_pseudo_rules(text): """Drops /p/ for pseudo- or psico- prefixes""" def replace_psicpseud_with_case(match): ps_syllable = match.group(1) if ps_syllable[0] == u'p': return ps_syllable[1:] else: return ps_syllable[1].upper() + ps_syllable[2:] text = re.sub(ur'(psic|pseud)', replace_psicpseud_with_case, text, flags=re.IGNORECASE) return text def vaf_rules(text, vaf=VAF): """Replacing Voiceless alveolar fricative (vaf) /s/ /θ/ with EPA's ç/Ç""" def replace_with_case(match): l_char = match.group(1) next_char = match.group(2) if l_char.islower(): return vaf + next_char else: return vaf.upper() + next_char text = re.sub(ur'(z|s)(a|e|i|o|u|á|é|í|ó|ú|â|ê|î|ô|û)', replace_with_case, text, flags=re.IGNORECASE|re.UNICODE) text = re.sub(ur'(c)(e|i|é|í|ê|î)', replace_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text def digraph_rules(text): """Replacement of consecutive consonant with EPA VAF""" def replace_lstrst_with_case(match): vowel_char = match.group(1) lr_char = match.group(2) t_char = match.group(4) if lr_char == u'l': lr_char == 'r' elif lr_char == u'L': lr_char == 'R' else: pass return vowel_char + lr_char + t_char*2 def replace_bdnr_s_with_case(match): vowel_char = match.group(1) cons_char = match.group(2) s_char = match.group(3) digraph_char = match.group(4) if cons_char.lower() + s_char.lower() == u'rs': return vowel_char + cons_char + digraph_char*2 else: return get_vowel_circumflex(vowel_char) + digraph_char*2 def replace_transpost_with_case(match): init_char = match.group(1) vowel_char = match.group(2) cons_char = match.group(4) if cons_char.lower() == u'l': return init_char + get_vowel_circumflex(vowel_char) + cons_char + u'-' + cons_char else: return init_char + get_vowel_circumflex(vowel_char) + cons_char*2 def replace_l_with_case(match): vowel_char = match.group(1) digraph_char = match.group(3) return get_vowel_circumflex(vowel_char) + digraph_char + u'-' + digraph_char def replace_digraph_with_case(match): vowel_char = match.group(1) to_drop_char, digraph_char = match.group(2) return get_vowel_circumflex(vowel_char) + digraph_char*2 # intersticial / solsticio / superstición / cárstico => interttiçiâh / çorttiçio / çuperttiçión / cárttico text = re.sub(ur'(a|e|i|o|u|á|é|í|ó|ú)(l|r)(s)(t)', replace_lstrst_with_case, text, flags=re.IGNORECASE) # aerotransporte => aerotrâpporte | translado => trâl-lado | transcendente => trâççendente | postoperatorio => pôttoperatorio | postpalatal => pôppalatal text = re.sub(ur'(tr|p)(a|o)(ns|st)(b|c|ç|Ç|d|f|g|h|j|k|l|m|n|p|q|s|t|v|w|x|y|z)', replace_transpost_with_case, text, flags=re.IGNORECASE|re.UNICODE) # abstracto => âttrâtto | adscrito => âccrito | perspectiva => pêrppêttiba text = re.sub(ur'(a|e|i|o|u|á|é|í|ó|ú)(b|d|n|r)(s)(b|c|ç|Ç|d|f|g|h|j|k|l|m|n|p|q|s|t|v|w|x|y|z)', replace_bdnr_s_with_case, text, flags=re.IGNORECASE|re.UNICODE) # atlántico => âl-lántico | orla => ôl-la | adlátere => âl-látere | tesla => têl-la ... text = re.sub(ur'(a|e|i|o|u|á|é|í|ó|ú)(d|j|r|s|t|x|z)(l)', replace_l_with_case, text, flags=re.IGNORECASE|re.UNICODE) # General digraph rules. text = re.sub(ur'(a|e|i|o|u|á|é|í|ó|ú)(' + u'|'.join(DIGRAPHS) + ')', replace_digraph_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text def word_ending_rules(text): def replace_d_end_with_case(match): unstressed_rules = { u'a':u'â', u'A':u'Â', u'á':u'â', u'Á':u'Â', u'e':u'ê', u'E':u'Ê', u'é':u'ê', u'É':u'Ê', u'i':u'î', u'I':u'Î', u'í':u'î', u'Í':u'Î', u'o':u'ô', u'O':u'Ô', u'ó':u'ô', u'Ó':u'Ô', u'u':u'û', u'U':u'Û', u'ú':u'û', u'Ú':u'Û' } stressed_rules = { u'a':u'á', u'A':u'Á', u'á':u'á', u'Á':u'Á', u'e':u'é', u'E':u'É', u'é':u'é', u'É':u'É', u'i':u'î', u'I':u'Î', u'í':u'î', u'Í':u'Î', u'o':u'ô', u'O':u'Ô', u'ó':u'ô', u'Ó':u'Ô', u'u':u'û', u'U':u'Û', u'ú':u'û', u'Ú':u'Û' } word = match.group(0) prefix = match.group(1) suffix_vowel = match.group(2) suffix_const = match.group(3) if word.lower() in WORDEND_D_RULES_EXCEPT.keys(): return keep_case(word, WORDEND_D_RULES_EXCEPT[word.lower()]) if any(s in prefix for s in (u'á',u'é',u'í',u'ó',u'ú',u'Á',u'É',u'Í',u'Ó',u'Ú')): return prefix + unstressed_rules[suffix_vowel] else: if suffix_vowel in (u'a',u'e',u'A',u'E',u'á',u'é',u'Á',u'É'): return prefix + stressed_rules[suffix_vowel] else: if suffix_const.isupper(): return prefix + stressed_rules[suffix_vowel] + 'H' else: return prefix + stressed_rules[suffix_vowel] + 'h' def replace_s_end_with_case(match): repl_rules = { u'a':u'â', u'A':u'Â', u'á':u'â', u'Á':u'Â', u'e':u'ê', u'E':u'Ê', u'é':u'ê', u'É':u'Ê', u'i':u'î', u'I':u'Î', u'í':u'î', u'Í':u'Î', u'o':u'ô', u'O':u'Ô', u'ó':u'ô', u'Ó':u'Ô', u'u':u'û', u'U':u'Û', u'ú':u'û', u'Ú':u'Û' } prefix = match.group(1) suffix_vowel = match.group(2) suffix_const = match.group(3) word = prefix + suffix_vowel + suffix_const if word.lower() in WORDEND_S_RULES_EXCEPT.keys(): return keep_case(word, WORDEND_S_RULES_EXCEPT[word.lower()]) elif suffix_vowel in (u'á',u'é',u'í',u'ó',u'ú',u'Á',u'É',u'Í',u'Ó',u'Ú'): if suffix_const.isupper(): return prefix + repl_rules[suffix_vowel] + 'H' else: return prefix + repl_rules[suffix_vowel] + 'h' else: return prefix + repl_rules[suffix_vowel] def replace_const_end_with_case(match): repl_rules = { u'a':u'â', u'A':u'Â', u'á':u'â', u'Á':u'Â', u'e':u'ê', u'E':u'Ê', u'é':u'ê', u'É':u'Ê', u'i':u'î', u'I':u'Î', u'í':u'î', u'Í':u'Î', u'o':u'ô', u'O':u'Ô', u'ó':u'ô', u'Ó':u'Ô', u'u':u'û', u'U':u'Û', u'ú':u'û', u'Ú':u'Û' } word = match.group(0) prefix = match.group(1) suffix_vowel = match.group(2) suffix_const = match.group(3) if word.lower() in WORDEND_CONST_RULES_EXCEPT.keys(): return keep_case(word, WORDEND_CONST_RULES_EXCEPT[word.lower()]) elif any(s in prefix for s in (u'á',u'é',u'í',u'ó',u'ú',u'Á',u'É',u'Í',u'Ó',u'Ú')): return prefix + repl_rules[suffix_vowel] else: if suffix_const.isupper(): return prefix + repl_rules[suffix_vowel] + 'H' else: return prefix + repl_rules[suffix_vowel] + 'h' def replace_eps_end_with_case(match): prefix = match.group(1) suffix_vowel = match.group(2) suffix_const = match.group(3) if any(s in prefix for s in (u'á',u'é',u'í',u'ó',u'ú',u'Á',u'É',u'Í',u'Ó',u'Ú')): if suffix_vowel.isupper(): return prefix + u'Ê' else: return prefix + u'ê' else: # Leave as it is. There shouldn't be any word with -eps ending withough accent. return prefix + suffix_vowel + suffix_const def replace_intervowel_d_end_with_case(match): prefix = match.group(1) suffix_vowel_a = match.group(2) suffix_d_char = match.group(3) suffix_vowel_b = match.group(4) ending_s = match.group('s') suffix = suffix_vowel_a + suffix_d_char + suffix_vowel_b + ending_s word = prefix + suffix if word.lower() in WORDEND_D_INTERVOWEL_RULES_EXCEPT.keys(): return keep_case(word, WORDEND_D_INTERVOWEL_RULES_EXCEPT[word.lower()]) elif not any(s in prefix for s in (u'á',u'é',u'í',u'ó',u'ú',u'Á',u'É',u'Í',u'Ó',u'Ú')): # Ending word -ada rules if suffix.lower() == u'ada': if suffix_vowel_b.isupper(): return prefix + u'Á' else: return prefix + u'á' # Ending word -ada rules if suffix.lower() == u'adas': return prefix + keep_case(suffix[:2], get_vowel_circumflex(suffix[0]) + u'h') # Ending word -ado rules elif suffix.lower() == u'ado': return prefix + suffix_vowel_a + suffix_vowel_b # Ending word -ados -idos -ídos rules elif suffix.lower() in (u'ados', u'idos', u'ídos'): return prefix + get_vowel_tilde(suffix_vowel_a) + get_vowel_circumflex(suffix_vowel_b) # Ending word -ido -ído rules elif suffix.lower() in (u'ido', u'ído'): if suffix_vowel_a.isupper(): return prefix + u'Í' + suffix_vowel_b else: return prefix + u'í' + suffix_vowel_b else: return word else: return word # Intervowel /d/ replacements text = re.sub(ur'\b(\w*?)(a|i|í|Í)(d)(o|a)(?Ps?)\b', replace_intervowel_d_end_with_case, text, flags=re.IGNORECASE|re.UNICODE) text = re.sub(ur'\b(\w+?)(e)(ps)\b', replace_eps_end_with_case, text, flags=re.IGNORECASE|re.UNICODE) text = re.sub(ur'\b(\w+?)(a|e|i|o|u|á|é|í|ó|ú)(d)\b', replace_d_end_with_case, text, flags=re.IGNORECASE|re.UNICODE) text = re.sub(ur'\b(\w+?)(a|e|i|o|u|á|é|í|ó|ú)(s)\b', replace_s_end_with_case, text, flags=re.IGNORECASE|re.UNICODE) text = re.sub(ur'\b(\w+?)(a|e|i|o|u|á|é|í|ó|ú)(b|c|f|g|j|k|l|p|r|t|x|z)\b', replace_const_end_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text def exception_rules(text): """Set of exceptions to the replacement algorithm""" def replace_with_case(match): word = match.group(1) replacement_word = ENDING_RULES_EXCEPTION[word.lower()] return keep_case(word, replacement_word) text = re.sub(ur'\b(' + u'|'.join(ENDING_RULES_EXCEPTION.keys()) + ur')\b', replace_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text def word_interaction_rules(text): """Contractions and other word interaction rules""" def replace_with_case(match): prefix = match.group(1) l_char = match.group(2) whitespace_char = match.group(3) next_word_char = match.group(4) r_char = keep_case(l_char, u'r') return prefix + r_char + whitespace_char + next_word_char # Rotating word ending /l/ with /r/ if first next word char is non-r consonant text = re.sub(ur'\b(\w*?)(l)(\s)(b|c|ç|d|f|g|h|j|k|l|m|n|ñ|p|q|s|t|v|w|x|y|z)', replace_with_case, text, flags=re.IGNORECASE|re.UNICODE) return text # Main function def epa(text, vaf=VAF, vvf=VVF, escapeLinks=False, debug=False): rules = [ h_rules, x_rules, ch_rules, gj_rules, v_rules, ll_rules, l_rules, psico_pseudo_rules, vaf_rules, word_ending_rules, digraph_rules, exception_rules, word_interaction_rules ] if type(text) != unicode: text = unicode(text, 'utf-8') # Do not start transcription if the input is empty if not text: return text def transliterate(text): for rule in rules: if rule in [x_rules, vaf_rules]: text = rule(text, vaf) elif rule == gj_rules: text = rule(text, vvf) else: text = rule(text) if debug: print rule.func_name + ' => ' + text return text if escapeLinks: ignore = to_ignore_re.findall(text) # Words in the message not to transliterate words = to_ignore_re.split(text) # Spanish words in the message to transliterate if not ignore: tags = [] text = text else: # Replace words to ignore in the transliteration with randints tags = zip(map(lambda x: str(random.randint(1,999999999)), ignore), ignore) text = ''.join(reduce(lambda x, y: ''.join(x) + ''.join(y), zip(words, [x[0] for x in tags]))) if len(words) > len(ignore): text += words[-1] if debug: print 'escapeLinks => ' + text text_and = transliterate(text) for tag in tags: text_and = text_and.replace(tag[0], tag[1]) if debug: print 'unEscapeLinks => ' + text_and return text_and else: return transliterate(text) class AndaluhError(Exception): def __init__(self, message, errors): # Call the base class constructor with the parameters it needs super(AndaluhError, self).__init__(message) # Now for your custom code... self.errors = errors