# Source1: https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py # Source2: https://github.com/huggingface/evaluate/blob/main/metrics/bleu/bleu.py # Copyright 2017 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Python implementation of BLEU and smooth-BLEU. This module provides a Python implementation of BLEU and smooth-BLEU. Smooth BLEU is computed following the method outlined in the paper: Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic evaluation metrics for machine translation. COLING 2004. """ import collections import math from py_token import Tokenizer13a def _get_ngrams(segment, max_order): """Extracts all n-grams upto a given maximum order from an input segment. Args: segment: text segment from which n-grams will be extracted. max_order: maximum length in tokens of the n-grams returned by this methods. Returns: The Counter containing all n-grams upto max_order in segment with a count of how many times each n-gram occurred. """ ngram_counts = collections.Counter() for order in range(1, max_order + 1): for i in range(0, len(segment) - order + 1): ngram = tuple(segment[i:i+order]) ngram_counts[ngram] += 1 return ngram_counts def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=False): """Computes BLEU score of translated segments against one or more references. Args: reference_corpus: list of lists of references for each translation. Each reference should be tokenized into a list of tokens. translation_corpus: list of translations to score. Each translation should be tokenized into a list of tokens. max_order: Maximum n-gram order to use when computing BLEU score. smooth: Whether or not to apply Lin et al. 2004 smoothing. Returns: 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram precisions and brevity penalty. """ matches_by_order = [0] * max_order possible_matches_by_order = [0] * max_order reference_length = 0 translation_length = 0 tokenizer = Tokenizer13a() reference_corpus = [[tokenizer(r) for r in ref] for ref in reference_corpus] translation_corpus = [tokenizer(p) for p in translation_corpus] for (references, translation) in zip(reference_corpus, translation_corpus): reference_length += min(len(r) for r in references) translation_length += len(translation) merged_ref_ngram_counts = collections.Counter() for reference in references: merged_ref_ngram_counts |= _get_ngrams(reference, max_order) translation_ngram_counts = _get_ngrams(translation, max_order) overlap = translation_ngram_counts & merged_ref_ngram_counts for ngram in overlap: matches_by_order[len(ngram)-1] += overlap[ngram] for order in range(1, max_order+1): possible_matches = len(translation) - order + 1 if possible_matches > 0: possible_matches_by_order[order-1] += possible_matches precisions = [0] * max_order for i in range(0, max_order): if smooth: precisions[i] = ((matches_by_order[i] + 1.) / (possible_matches_by_order[i] + 1.)) else: if possible_matches_by_order[i] > 0: precisions[i] = (float(matches_by_order[i]) / possible_matches_by_order[i]) else: precisions[i] = 0.0 if min(precisions) > 0: p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) geo_mean = math.exp(p_log_sum) else: geo_mean = 0 ratio = float(translation_length) / reference_length if ratio > 1.0: bp = 1. else: bp = math.exp(1 - 1. / ratio) bleu = geo_mean * bp return bleu, precisions, bp, ratio, translation_length, reference_length if __name__ == "__main__": res = compute_bleu( reference_corpus=[["Hello, World!"]], translation_corpus=["Yellow, World!"], max_order=4, smooth=True, ) print(res)