# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from functools import lru_cache


class BaseTokenizer:
    """A base dummy tokenizer to derive from."""

    def signature(self):
        """
        Returns a signature for the tokenizer.

        :return: signature string
        """
        return "none"

    def __call__(self, line):
        """
        Tokenizes an input line with the tokenizer.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        return line


class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
        self._re = [
            # language-dependent part (assuming Western languages)
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # one space only between words
            # NOTE: Doing this in Python (below) is faster
            # (re.compile(r'\s+'), r' '),
        ]

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        for (_re, repl) in self._re:
            line = _re.sub(repl, line)

        # no leading or trailing spaces, single space within words
        # return ' '.join(line.split())
        # This line is changed with regard to the original tokenizer
        # (linked above) so that individual words are returned.
        return line.split()


class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is, however, equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: the tokenized line
        """

        # language-independent part:
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")

        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")

        return self._post_tokenizer(f" {line} ")


if __name__ == "__main__":
    tokenizer = Tokenizer13a()
    print(tokenizer("Hello\n-\n, &quot;World!"))
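
    # Illustrative sanity checks (not part of the original file), assuming the
    # regexp behavior above: digit-adjacent periods and commas stay attached to
    # the number, while sentence-level punctuation is split into its own token.
    print(tokenizer("It costs 1,000.50 dollars."))
    # expected: ['It', 'costs', '1,000.50', 'dollars', '.']
    print(tokenizer("Hello, World!"))
    # expected: ['Hello', ',', 'World', '!']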