#[macro_use] extern crate criterion; use criterion::Criterion; use std::collections::HashMap; use std::fs::read_to_string; use std::time::{Duration, Instant}; use tokenizers::models::unigram::Unigram; use tokenizers::models::unigram::UnigramTrainer; pub fn bench_train(c: &mut Criterion) { let trainer = UnigramTrainer::builder() .show_progress(false) .unk_token(Some("".into())) .build() .unwrap(); let mut model = Unigram::default(); let content = read_to_string("data/small.txt").unwrap(); let mut word_counts = HashMap::new(); content.split_whitespace().for_each(|word| { // This is important for the test of char vs u8 let word = format!("▁{word}"); *word_counts.entry(word).or_insert(0) += 1; }); let sentences: Vec<_> = word_counts .iter() .map(|(s, i)| (s.to_owned(), *i)) .collect(); c.bench_function("Unigram Train vocabulary (small)", |b| { b.iter_custom(|iters| { let mut duration = Duration::new(0, 0); for _i in 0..iters { let sentences = sentences.clone(); let start = Instant::now(); trainer.do_train(sentences, &mut model).unwrap(); duration = duration.checked_add(start.elapsed()).unwrap(); } duration }) }); let content = read_to_string("data/big.txt").unwrap(); // creating `medium` data, which is the first 25% of `data/big.txt` let content = String::from(&content[..(content.len() as f64 * 0.25) as usize]); let mut word_counts = HashMap::new(); content.split_whitespace().for_each(|word| { // This is important for the test of char vs u8 let word = format!("▁{word}"); *word_counts.entry(word).or_insert(0) += 1; }); let sentences: Vec<_> = word_counts .iter() .map(|(s, i)| (s.to_owned(), *i)) .collect(); c.bench_function("Unigram Train vocabulary (medium)", |b| { b.iter_custom(|iters| { let mut duration = Duration::new(0, 0); for _i in 0..iters { let sentences = sentences.clone(); let start = Instant::now(); trainer.do_train(sentences, &mut model).unwrap(); duration = duration.checked_add(start.elapsed()).unwrap(); } duration }) }); } criterion_group! { name = benches_train; config = Criterion::default().sample_size(10); targets = bench_train } criterion_main!(benches_train);