// Copyright 2022 hjiayz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use regex::bytes::Regex;
use regexp2rust_macro::regex2rust;

const DNA: &'static str = include_str!("regexdna.txt");

regex2rust!( find_new_lines =  ">[^\n]*\n|\n" "ug" );
regex2rust!( variant1 = "agggtaaa|tttaccct" "ug" );
regex2rust!( variant2 = "[cgt]gggtaaa|tttaccc[acg]" "ug" );
regex2rust!( variant3 = "a[act]ggtaaa|tttacc[agt]t" "ug" );
regex2rust!( variant4 = "ag[act]gtaaa|tttac[agt]ct" "ug" );
regex2rust!( variant5 = "agg[act]taaa|ttta[agt]cct" "ug" );
regex2rust!( variant6 = "aggg[acg]aaa|ttt[cgt]ccct" "ug" );
regex2rust!( variant7 = "agggt[cgt]aa|tt[acg]accct" "ug" );
regex2rust!( variant8 = "agggta[cgt]a|t[acg]taccct" "ug" );
regex2rust!( variant9 = "agggtaa[cgt]|[acg]ttaccct" "ug" );
regex2rust!( subst1 = "B" "ug" );
regex2rust!( subst2 = "D" "ug" );
regex2rust!( subst3 = "H" "ug" );
regex2rust!( subst4 = "K" "ug" );
regex2rust!( subst5 = "M" "ug" );
regex2rust!( subst6 = "N" "ug" );
regex2rust!( subst7 = "R" "ug" );
regex2rust!( subst8 = "S" "ug" );
regex2rust!( subst9 = "V" "ug" );
regex2rust!( subst10 = "W" "ug" );
regex2rust!( subst11 = "Y" "ug" );
regex2rust!( subst12 = "tttaccca|tttac" "ug" );

fn regex2rustbench(s: &str) {
    assert_eq!(find_new_lines::Iter::new(s, 0).count(), 83337);
    assert_eq!(variant1::Iter::new(s, 0).count(), 32);
    assert_eq!(variant2::Iter::new(s, 0).count(), 115);
    assert_eq!(variant3::Iter::new(s, 0).count(), 368);
    assert_eq!(variant4::Iter::new(s, 0).count(), 254);
    assert_eq!(variant5::Iter::new(s, 0).count(), 466);
    assert_eq!(variant6::Iter::new(s, 0).count(), 135);
    assert_eq!(variant7::Iter::new(s, 0).count(), 137);
    assert_eq!(variant8::Iter::new(s, 0).count(), 139);
    assert_eq!(variant9::Iter::new(s, 0).count(), 197);
    assert_eq!(subst1::Iter::new(s, 0).count(), 29963);
    assert_eq!(subst2::Iter::new(s, 0).count(), 29987);
    assert_eq!(subst3::Iter::new(s, 0).count(), 30004);
    assert_eq!(subst4::Iter::new(s, 0).count(), 29974);
    assert_eq!(subst5::Iter::new(s, 0).count(), 29979);
    assert_eq!(subst6::Iter::new(s, 0).count(), 29959);
    assert_eq!(subst7::Iter::new(s, 0).count(), 30012);
    assert_eq!(subst8::Iter::new(s, 0).count(), 29994);
    assert_eq!(subst9::Iter::new(s, 0).count(), 29996);
    assert_eq!(subst10::Iter::new(s, 0).count(), 29984);
    assert_eq!(subst11::Iter::new(s, 0).count(), 29947);
    assert_eq!(subst12::Iter::new(s, 0).count(), 4584);
}

fn regexbench(s: &str, regexs: &[Regex]) {
    let s = s.as_bytes();

    assert_eq!(regexs[0].find_iter(s).count(), 83337);
    assert_eq!(regexs[1].find_iter(s).count(), 32);
    assert_eq!(regexs[2].find_iter(s).count(), 115);
    assert_eq!(regexs[3].find_iter(s).count(), 368);
    assert_eq!(regexs[4].find_iter(s).count(), 254);
    assert_eq!(regexs[5].find_iter(s).count(), 466);
    assert_eq!(regexs[6].find_iter(s).count(), 135);
    assert_eq!(regexs[7].find_iter(s).count(), 137);
    assert_eq!(regexs[8].find_iter(s).count(), 139);
    assert_eq!(regexs[9].find_iter(s).count(), 197);
    assert_eq!(regexs[10].find_iter(s).count(), 29963);
    assert_eq!(regexs[11].find_iter(s).count(), 29987);
    assert_eq!(regexs[12].find_iter(s).count(), 30004);
    assert_eq!(regexs[13].find_iter(s).count(), 29974);
    assert_eq!(regexs[14].find_iter(s).count(), 29979);
    assert_eq!(regexs[15].find_iter(s).count(), 29959);
    assert_eq!(regexs[16].find_iter(s).count(), 30012);
    assert_eq!(regexs[17].find_iter(s).count(), 29994);
    assert_eq!(regexs[18].find_iter(s).count(), 29996);
    assert_eq!(regexs[19].find_iter(s).count(), 29984);
    assert_eq!(regexs[20].find_iter(s).count(), 29947);
    assert_eq!(regexs[21].find_iter(s).count(), 4584);
}

fn criterion_benchmark(c: &mut Criterion) {
    let regexs = [
        ">[^\n]*\n|\n",
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct",
        "B",
        "D",
        "H",
        "K",
        "M",
        "N",
        "R",
        "S",
        "V",
        "W",
        "Y",
        "tttaccca|tttac",
    ]
    .map(|reg| Regex::new(reg).unwrap());
    c.bench_function("regex", |b| b.iter(|| regexbench(black_box(DNA), &regexs)));
    c.bench_function("regex2rust", |b| b.iter(|| regex2rustbench(black_box(DNA))));
}
const S: &'static str = include_str!("sherlock.txt");


macro_rules! sherlock {
    ($name:ident, $pattern:literal, $count:expr) => {

        fn $name(c: &mut Criterion) {
            let reg = Regex::new($pattern).unwrap();
            c.bench_function(concat!("regex ",stringify!($name)), |b| b.iter(|| assert_eq!(reg.find_iter(S.as_bytes()).count(), $count)));
            c.bench_function(concat!("regex2rust ",stringify!($name)), |b| b.iter(|| assert_eq!(reg2rust::$name::Iter::new(S, 0).count(), $count)));
        }
        
    };
}

pub mod reg2rust{
    use regexp2rust_macro::regex2rust;
    regex2rust!( name_sherlock = r"Sherlock" "ug" );
regex2rust!( name_holmes = r"Holmes" "ug" );
regex2rust!( name_sherlock_holmes = r"Sherlock Holmes" "ug" );

// Like the above, except case insensitively. The prefix detector will extract
// multiple *cut* prefix literals for each of the following before hitting its
// limit. All of these should be able to use either memchr2 or memchr3.
// std C++ does not support inline modifier syntax
regex2rust!( name_sherlock_nocase = r"Sherlock" "uig" );
regex2rust!( name_holmes_nocase = r"Holmes" "uig" );
regex2rust!( name_sherlock_holmes_nocase = r"Sherlock Holmes" "uig" );

// Will quickly find instances of 'Sherlock', but then needs to fall back to
// the lazy DFA to process the Unicode aware `\s`.
regex2rust!( name_whitespace = r"Sherlock\s+Holmes" "ug" );

// Now try more variations on name matching.
// This one has two alternates that both start with 'S'. This should compile
// to an Aho-Corasick automaton that uses memchr. Never enters lazy DFA.
regex2rust!( name_alt1 = r"Sherlock|Street" "ug" );
// This one doesn't have a common byte, but should still use Aho-Corasick and
// memchr2.
// Never enters lazy DFA.
regex2rust!( name_alt2 = r"Sherlock|Holmes" "ug" );
// Still using Aho-Corasick, but more patterns. Never enters lazy DFA but
// also can't use any memchr variant.
regex2rust!( name_alt3 = r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker" "ug" );
// Still using Aho-Corasick, but needs the lazy DFA.
regex2rust!( name_alt3_nocase = r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker" "uig");
// Should still use Aho-Corasick for the prefixes in each alternate, but
// we need to use the lazy DFA to complete it.
regex2rust!( name_alt4 = r"Sher[a-z]+|Hol[a-z]+" "ug" );
regex2rust!( name_alt4_nocase = r"Sher[a-z]+|Hol[a-z]+" "uig" );
// Uses Aho-Corasick, but can use memchr3 (unlike name_alt3).
regex2rust!( name_alt5 = r"Sherlock|Holmes|Watson" "ug" );
regex2rust!( name_alt5_nocase = r"Sherlock|Holmes|Watson" "uig" );

// How long does it take to discover that there's no match? In the first two
// cases, we detect the rarest byte in the literal to run memchr on. In the
// first, it's 'z' and in the second it's 'j'. The third case only has common
// letters, and is therefore slower.
regex2rust!( no_match_uncommon = r"zqj" "ug" );
regex2rust!( no_match_common = r"aqj" "ug" );
regex2rust!( no_match_really_common = r"aei" "ug" );

// Various twiddling on very common words. This tends to stress the constant
// overhead of actually reporting a match. (None of these actually enter any
// matching engines.)
regex2rust!( the_lower = r"the" "ug" );
regex2rust!( the_upper = r"The" "ug" );
regex2rust!( the_nocase = r"the" "uig" );

// Process whitespace after a very common word.
// Uses Boyer-Moore to find `the` and the lazy DFA for the rest.
regex2rust!( the_whitespace = r"the\s+\w+" "ug" );

// How fast can we match everything? This essentially defeats any clever prefix
// tricks and just executes the DFA across the entire input.
regex2rust!( everything_greedy = r".*" "ug" );

regex2rust!( everything_greedy_nl = r".*" "usg" );

// How fast can we match every letter? This also defeats any clever prefix
// tricks.
regex2rust!( letters = r"\p{L}" "ug" );

regex2rust!( letters_upper = r"\p{Lu}" "ug" );

regex2rust!( letters_lower = r"\p{Ll}" "ug" );

regex2rust!( words = r"\w+" "ug" );
//regex2rust!( words = r"\w+" "ug" ); // hmm, why does RE2 diverge here?

// Find complete words before Holmes. The `\w` defeats any prefix
// optimizations.
regex2rust!( before_holmes = r"\w+\s+Holmes" "ug" );

// Find complete words before Holmes. Both of the `\w`s defeat any prefix
// and suffix optimizations.
regex2rust!( before_after_holmes = r"\w+\s+Holmes\s+\w+" "ug" );

// Find Holmes co-occurring with Watson in a particular window of characters.
// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
// the rest.
regex2rust!( holmes_cochar_watson = r"Holmes.{0,25}Watson|Watson.{0,25}Holmes" "ug" );

// Find Holmes co-occurring with Watson in a particular window of words.
// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
// the rest.
//regex2rust!( holmes_coword_watson = r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes" "ug" );

// Finds all occurrences of Sherlock Holmes at the beginning or end of a line.
// The empty assertions defeat any detection of prefix literals, so it's the
// lazy DFA the entire way.
regex2rust!(line_boundary_sherlock_holmes = r"^Sherlock Holmes|Sherlock Holmes$" "umg");

// All words ending in `n`. This uses Unicode word boundaries, which the DFA
// can speculatively handle. Since this benchmark is on mostly ASCII text, it
// performs well here. A different benchmark with non-Western text would be
// more revealing since the search would be forced to fall back to an NFA
// simulation.
regex2rust!( word_ending_n = r"\b\w+n\b" "ug" );

// This is a real bad one for Rust's engine. This particular expression
// fills the state cache quite frequently, which results in a lot of churn.
// This can be made to go roughly the speed of PCRE by increasing the DFA cache
// size.
//
// Its only salvation is that the DFA realizes it's executing slowly, gives up
// quickly and falls back to the NFA algorithm.
//
// RE2 seems to do a worse job at this than Rust. So much so that it's slow
// enough to be annoying, so we disable it.
regex2rust!( repeated_class_negation = r"[a-q][^u-z]{13}x" "ug" );

// This defeats any prefix optimizations but triggers the reverse suffix
// optimization.
regex2rust!( ing_suffix = r"[a-zA-Z]+ing" "ug" );

// Similar to ing_suffix, but a little more complex by limiting the length
// of the word and making sure it's surrounded by whitespace. The trailing
// `\s` defeats the reverse suffix optimization.
//
// Onig does surprisingly well on this benchmark and yet does quite poorly on
// the ing_suffix benchmark. That one has me stumped.
regex2rust!( ing_suffix_limited_space = r"\s[a-zA-Z]{0,12}ing\s" "ug" );

}

// These patterns are all single string literals that compile down to a variant
// of Boyer-Moore w/ memchr. This also demonstrates the impact that the
// frequency of a match has on performance.
sherlock!(name_sherlock, r"Sherlock", 97);
sherlock!(name_holmes, r"Holmes", 461);
sherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91);

// Like the above, except case insensitively. The prefix detector will extract
// multiple *cut* prefix literals for each of the following before hitting its
// limit. All of these should be able to use either memchr2 or memchr3.
// std C++ does not support inline modifier syntax
sherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102);
sherlock!(name_holmes_nocase, r"(?i)Holmes", 467);
sherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96);

// Will quickly find instances of 'Sherlock', but then needs to fall back to
// the lazy DFA to process the Unicode aware `\s`.
sherlock!(name_whitespace, r"Sherlock\s+Holmes", 97);

// Now try more variations on name matching.
// This one has two alternates that both start with 'S'. This should compile
// to an Aho-Corasick automaton that uses memchr. Never enters lazy DFA.
sherlock!(name_alt1, r"Sherlock|Street", 158);
// This one doesn't have a common byte, but should still use Aho-Corasick and
// memchr2.
// Never enters lazy DFA.
sherlock!(name_alt2, r"Sherlock|Holmes", 558);
// Still using Aho-Corasick, but more patterns. Never enters lazy DFA but
// also can't use any memchr variant.
sherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740);
// Still using Aho-Corasick, but needs the lazy DFA.
sherlock!(
    name_alt3_nocase,
    r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker",
    753
);
// Should still use Aho-Corasick for the prefixes in each alternate, but
// we need to use the lazy DFA to complete it.
sherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582);
sherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697);
// Uses Aho-Corasick, but can use memchr3 (unlike name_alt3).
sherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639);
sherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650);

// How long does it take to discover that there's no match? In the first two
// cases, we detect the rarest byte in the literal to run memchr on. In the
// first, it's 'z' and in the second it's 'j'. The third case only has common
// letters, and is therefore slower.
sherlock!(no_match_uncommon, r"zqj", 0);
sherlock!(no_match_common, r"aqj", 0);
sherlock!(no_match_really_common, r"aei", 0);

// Various twiddling on very common words. This tends to stress the constant
// overhead of actually reporting a match. (None of these actually enter any
// matching engines.)
sherlock!(the_lower, r"the", 7218);
sherlock!(the_upper, r"The", 741);
sherlock!(the_nocase, r"(?i)the", 7987);

// Process whitespace after a very common word.
// Uses Boyer-Moore to find `the` and the lazy DFA for the rest.
sherlock!(the_whitespace, r"the\s+\w+", 5410);

// How fast can we match everything? This essentially defeats any clever prefix
// tricks and just executes the DFA across the entire input.
//sherlock!(everything_greedy, r".*", 13053);

//sherlock!(everything_greedy_nl, r"(?s).*", 1);

// How fast can we match every letter? This also defeats any clever prefix
// tricks.
sherlock!(letters, r"\p{L}", 447160);

sherlock!(letters_upper, r"\p{Lu}", 14180);

sherlock!(letters_lower, r"\p{Ll}", 432980);

//sherlock!(words, r"\w+", 109214);
//sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?

// Find complete words before Holmes. The `\w` defeats any prefix
// optimizations.
sherlock!(before_holmes, r"\w+\s+Holmes", 319);

// Find complete words before Holmes. Both of the `\w`s defeat any prefix
// and suffix optimizations.
sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137);

// Find Holmes co-occurring with Watson in a particular window of characters.
// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
// the rest.
sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);

// Find Holmes co-occurring with Watson in a particular window of words.
// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
// the rest.
//sherlock!(
//    holmes_coword_watson,
//    r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes",
//    51
//);

// Finds all occurrences of Sherlock Holmes at the beginning or end of a line.
// The empty assertions defeat any detection of prefix literals, so it's the
// lazy DFA the entire way.
sherlock!(
    line_boundary_sherlock_holmes,
    r"(?m)^Sherlock Holmes|Sherlock Holmes$",
    34
);

// All words ending in `n`. This uses Unicode word boundaries, which the DFA
// can speculatively handle. Since this benchmark is on mostly ASCII text, it
// performs well here. A different benchmark with non-Western text would be
// more revealing since the search would be forced to fall back to an NFA
// simulation.
sherlock!(word_ending_n, r"\b\w+n\b", 8366);

// This is a real bad one for Rust's engine. This particular expression
// fills the state cache quite frequently, which results in a lot of churn.
// This can be made to go roughly the speed of PCRE by increasing the DFA cache
// size.
//
// Its only salvation is that the DFA realizes it's executing slowly, gives up
// quickly and falls back to the NFA algorithm.
//
// RE2 seems to do a worse job at this than Rust. So much so that it's slow
// enough to be annoying, so we disable it.
sherlock!(repeated_class_negation, r"[a-q][^u-z]{13}x", 142);

// This defeats any prefix optimizations but triggers the reverse suffix
// optimization.
sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824);

// Similar to ing_suffix, but a little more complex by limiting the length
// of the word and making sure it's surrounded by whitespace. The trailing
// `\s` defeats the reverse suffix optimization.
//
// Onig does surprisingly well on this benchmark and yet does quite poorly on
// the ing_suffix benchmark. That one has me stumped.
sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081);

criterion_group!(benches2, name_sherlock,
    name_holmes,
    name_sherlock_holmes,
    name_sherlock_nocase,
    name_holmes_nocase,
    name_sherlock_holmes_nocase,
    name_whitespace,
    name_alt1,
    name_alt2,
    name_alt3,
    name_alt3_nocase,
    name_alt4,
    name_alt4_nocase,
    name_alt5,
    name_alt5_nocase,
    no_match_uncommon,
    no_match_common,
    no_match_really_common,
    the_lower,
    the_upper,
    the_nocase,
    the_whitespace,
    //everything_greedy,
    //everything_greedy_nl,
    letters,
    letters_upper,
    letters_lower,
    //words,
    before_holmes,
    before_after_holmes,
    holmes_cochar_watson,
    //holmes_coword_watson,
    //line_boundary_sherlock_holmes,
    word_ending_n,
    //repeated_class_negation,
    ing_suffix,
    //ing_suffix_limited_space
);
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches,benches2);