// SPDX-License-Identifier: MPL-2.0
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use rand::{Rng, SeedableRng};
use wikiwho::utils;
/// Builds the reproducible pseudo-random input used by the paragraph-splitting benchmarks.
///
/// `length` serves both as the RNG seed and as the number of random `char`s generated
/// up front. Afterwards `length / 10` entries of `VALUES` are spliced in at random
/// char boundaries.
fn generate_input_split_into_paragraphs(length: u64) -> String {
    // Strings spliced into the random text.
    // NOTE(review): the last three entries are a raw line break, an empty string (inserting
    // it changes nothing) and another raw line break. They may stand in for markers that
    // were lost from this listing; confirm against the delimiters the splitter looks for.
    const VALUES: &[&str] = &[
        "\r", "\n", "\r\n", "\n\n", "{|", "|}", "|-\n", "
", "", "
",
    ];

    // The algorithm is named explicitly and seeded with a fixed value so every run
    // benchmarks the same input.
    let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(length);

    // Random base text: one generated `char` per unit of `length`.
    let mut input: String = (0..length).map(|_| rng.gen::<char>()).collect();

    for _ in 0..(length / 10) {
        // Redraw until the byte offset lands on a char boundary; `insert_str` requires one.
        let pos = loop {
            let candidate = rng.gen_range(0..input.len());
            if input.is_char_boundary(candidate) {
                break candidate;
            }
        };
        let value = VALUES[rng.gen_range(0..VALUES.len())];
        input.insert_str(pos, value);
    }

    input
}
/// Benchmarks the naive and optimized paragraph splitters against each other.
///
/// Both variants run on the same generated input for every length in `LENGTHS`,
/// inside the `split_into_paragraphs` benchmark group.
fn bench_split_into_paragraphs(c: &mut Criterion) {
    const LENGTHS: [u64; 4] = [500, 1000, 5000, 10000];

    let mut group = c.benchmark_group("split_into_paragraphs");
    for length in LENGTHS {
        let input = generate_input_split_into_paragraphs(length);

        group.bench_with_input(BenchmarkId::new("Naive", length), &input, |bencher, text| {
            bencher.iter(|| utils::split_into_paragraphs_naive(text));
        });

        group.bench_with_input(
            BenchmarkId::new("Optimized", length),
            &input,
            |bencher, text| {
                // Scratch buffers are created once, outside the measured closure, and
                // handed to every iteration.
                let mut first_scratch = String::new();
                let mut second_scratch = String::new();
                bencher.iter(|| {
                    utils::split_into_paragraphs_optimized(
                        text,
                        (&mut first_scratch, &mut second_scratch),
                    )
                });
            },
        );
    }
    // Same effect as letting the group drop at the end of scope, just explicit.
    group.finish();
}
/// Builds the reproducible pseudo-random input used by the sentence-splitting benchmarks.
///
/// `length` serves both as the RNG seed and as the number of random `char`s generated
/// up front. Afterwards `length / 10` entries of `VALUES` are spliced in at random
/// char boundaries.
fn generate_input_split_into_sentences(length: u64) -> String {
    // Strings spliced into the random text.
    // NOTE(review): the empty string entry inserts nothing; it may stand in for a marker
    // that was lost from this listing. Confirm against what the splitter looks for.
    const VALUES: &[&str] = &[
        " ", "\n", ". ", ", ", "; ", ": ", "? ", "! ", "//", "http", "", "[",
    ];

    // The algorithm is named explicitly and seeded with a fixed value so every run
    // benchmarks the same input.
    let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(length);

    // Random base text: one generated `char` per unit of `length`.
    let mut input: String = (0..length).map(|_| rng.gen::<char>()).collect();

    for _ in 0..(length / 10) {
        // Redraw until the byte offset lands on a char boundary; `insert_str` requires one.
        let pos = loop {
            let candidate = rng.gen_range(0..input.len());
            if input.is_char_boundary(candidate) {
                break candidate;
            }
        };
        let value = VALUES[rng.gen_range(0..VALUES.len())];
        input.insert_str(pos, value);
    }

    input
}
/// Benchmarks the naive and optimized sentence splitters against each other.
///
/// Both variants run on the same generated input for every length in `LENGTHS`,
/// inside the `split_into_sentences` benchmark group.
fn bench_split_into_sentences(c: &mut Criterion) {
    const LENGTHS: [u64; 4] = [100, 500, 1000, 5000];

    let mut group = c.benchmark_group("split_into_sentences");
    for length in LENGTHS {
        let input = generate_input_split_into_sentences(length);

        group.bench_with_input(BenchmarkId::new("Naive", length), &input, |bencher, text| {
            bencher.iter(|| utils::split_into_sentences_naive(text));
        });

        group.bench_with_input(
            BenchmarkId::new("Optimized", length),
            &input,
            |bencher, text| {
                // Scratch buffers are created once, outside the measured closure, and
                // handed to every iteration.
                let mut first_scratch = String::new();
                let mut second_scratch = String::new();
                bencher.iter(|| {
                    utils::split_into_sentences_optimized(
                        text,
                        (&mut first_scratch, &mut second_scratch),
                    )
                });
            },
        );
    }
    // Same effect as letting the group drop at the end of scope, just explicit.
    group.finish();
}
/// Builds the reproducible pseudo-random input used by the tokenizer benchmarks.
///
/// `length` serves both as the RNG seed and as the number of random `char`s generated
/// up front. Afterwards `length / 10` entries of `VALUES` are spliced in at random
/// char boundaries.
fn generate_input_split_into_tokens(length: u64) -> String {
    // Strings spliced into the random text: whitespace, bracket pairs, ASCII punctuation
    // and a range of non-ASCII symbols.
    // NOTE(review): the empty string entry inserts nothing; it may stand in for a marker
    // that was lost from this listing. Confirm against what the tokenizer looks for.
    const VALUES: &[&str] = &[
        " ", "\n", "", "[[", "]]", "{{", "}}", "|", ".", ",", ";", ":", "?", "!", "-",
        "_", "/", "\\", "(", ")", "[", "]", "{", "}", "*", "#", "@", "&", "=", "+", "%", "~", "$",
        "^", "<", ">", "\"", "'", "´", "`", "¸", "˛", "’", "¤", "₳", "฿", "₵", "¢", "₡", "₢", "₫",
        "₯", "֏", "₠", "€", "ƒ", "₣", "₲", "₴", "₭", "₺", "₾", "ℳ", "₥", "₦", "₧", "₱", "₰", "£",
        "៛", "₽", "₹", "₨", "₪", "৳", "₸", "₮", "₩", "¥", "§", "‖", "¦", "⟨", "⟩", "–", "—", "¯",
        "»", "«", "”", "÷", "×", "′", "″", "‴", "¡", "¿", "©", "℗", "®", "℠", "™",
    ];

    // The algorithm is named explicitly and seeded with a fixed value so every run
    // benchmarks the same input.
    let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(length);

    // Random base text: one generated `char` per unit of `length`.
    let mut input: String = (0..length).map(|_| rng.gen::<char>()).collect();

    for _ in 0..(length / 10) {
        // Redraw until the byte offset lands on a char boundary; `insert_str` requires one.
        let pos = loop {
            let candidate = rng.gen_range(0..input.len());
            if input.is_char_boundary(candidate) {
                break candidate;
            }
        };
        let value = VALUES[rng.gen_range(0..VALUES.len())];
        input.insert_str(pos, value);
    }

    input
}
/// Benchmarks the naive tokenizer against the `corasick` variant.
///
/// Both variants run on the same generated input for every length in `LENGTHS`,
/// inside the `split_into_tokens` benchmark group.
fn bench_split_into_tokens(c: &mut Criterion) {
    const LENGTHS: [u64; 4] = [10, 50, 100, 500];

    let mut group = c.benchmark_group("split_into_tokens");
    for length in LENGTHS {
        let input = generate_input_split_into_tokens(length);

        group.bench_with_input(BenchmarkId::new("Naive", length), &input, |bencher, text| {
            bencher.iter(|| utils::split_into_tokens_naive(text));
        });

        group.bench_with_input(
            BenchmarkId::new("Corasick", length),
            &input,
            |bencher, text| {
                bencher.iter(|| utils::split_into_tokens_corasick(text));
            },
        );
    }
    // Same effect as letting the group drop at the end of scope, just explicit.
    group.finish();
}
fn generate_input_to_lowercase(ascii_ratio: f32) -> String {
const LENGTH: usize = 10000;
// generate inputs from fixed seeds
let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(ascii_ratio.to_bits().into()); /* define specific algorithm to ensure reproducibility */
let mut input = String::new();
for _ in 0..LENGTH {
if rng.gen::() < ascii_ratio {
input.push(rng.gen_range(0u8..0x80u8) as char);
} else {
input.push(rng.gen());
}
}
input
}
/// Benchmarks the standard library's `to_lowercase` against `utils::to_lowercase_opt`.
///
/// Both variants run on the same generated input for every ASCII ratio in the list,
/// inside the `to_lowercase` benchmark group.
fn bench_to_lowercase(c: &mut Criterion) {
    // The group gets its own name so these results are not reported under the
    // tokenizer benchmarks.
    let mut group = c.benchmark_group("to_lowercase");
    for ratio in [1.0, 0.99, 0.9, 0.5, 0.1] {
        let input = generate_input_to_lowercase(ratio);

        group.bench_with_input(BenchmarkId::new("Naive", ratio), &input, |b, i| {
            b.iter(|| i.to_lowercase());
        });

        group.bench_with_input(BenchmarkId::new("case-mapping", ratio), &input, |b, i| {
            b.iter(|| utils::to_lowercase_opt(i));
        });
    }
    // Same effect as letting the group drop at the end of scope, just explicit.
    group.finish();
}
criterion_group!(
benches,
bench_split_into_paragraphs,
bench_split_into_sentences,
bench_split_into_tokens,
bench_to_lowercase,
);
criterion_main!(benches);
]