# Copyright (C) 2022 Red Hat
# SPDX-License-Identifier: Apache-2.0

"""
This benchmark compares the Python and Rust runtimes.
However, the two processes are quite different: the new implementation does more
work to provide a better tokenization.
In particular, instead of applying regex replacements on the whole line, the new code
tries to tokenize each word, using a recursive algorithm to break down composite words.
"""

import timeit
import logreduce_tokenizer

# A very simple approximation of the new process
import re
# Pre-compiled, case-insensitive patterns used by native_process.
# A word starting with "http" (up to the next space) is treated as a URL.
http_re = re.compile("http[^ ]*", flags=re.IGNORECASE)
# Despite its name, this pattern lists day-of-week names as well as month names.
months_re = re.compile(
    "sunday|monday|tuesday|wednesday|thursday|friday|saturday|"
    "january|february|march|april|may|june|july|august|september|"
    "october|november|december",
    flags=re.IGNORECASE,
)
# Words are delimited by a single space or tab character.
word_re = re.compile("[ \t]")
def native_process(line):
    """Return *line* with URL-like and month/day-like words replaced by placeholders.

    The line is split on single spaces and tabs. A word whose start matches
    ``http_re`` becomes ``URL``, a word whose start matches ``months_re``
    becomes ``MONTH``, and any other word is kept unchanged. Every word,
    including the last one, is followed by a single space in the result.
    """
    pieces = []
    for token in word_re.split(line):
        # The URL check takes precedence over the month/day check.
        if http_re.match(token):
            token = "URL"
        elif months_re.match(token):
            token = "MONTH"
        pieces.append(token + " ")
    return "".join(pieces)

# Bench function
# Load the benchmark corpus once, before any timing starts, so that file I/O is
# not part of the measured section. "LICENSE" is resolved relative to the
# current working directory. The context manager closes the file handle as
# soon as the lines have been read instead of leaving it open.
with open("LICENSE") as license_file:
    data = license_file.readlines()
def bench(process):
    """Time *process* over every line of ``data`` and return the total in milliseconds.

    One pass calls ``process`` on each line of the module-level ``data`` list.
    The pass is repeated 1000 times; ``timeit`` reports the cumulative duration
    in seconds, which is then scaled to milliseconds.
    """

    def one_pass():
        return [process(line) for line in data]

    elapsed_seconds = timeit.timeit(one_pass, number=1000)
    return elapsed_seconds * 1000

# Time the pure-Python approximation first; its duration is the numerator of
# the speed-up ratio printed on the "Rust" line below.
py = bench(native_process)
print("Python   {:.0f}ms".format(py))
# Time logreduce_tokenizer.process (labelled "Rust" in the output) and report
# how many times faster it ran than the Python approximation.
rs = bench(logreduce_tokenizer.process)
print("Rust     {:.0f}ms ({:.1f} times faster)".format(rs, py / rs))

# NOTE(review): presumably `tokenizer` is the previous Python tokenizer that the
# new implementation replaces -- confirm. It is imported here, after the first
# two measurements, rather than at the top of the file.
import tokenizer
base = bench(tokenizer.Tokenizer.process)
print("Baseline {:.0f}ms".format(base))