// This file is part of cuniq. Copyright © 2024 cuniq contributors. // cuniq is licensed under the GNU GPL v3.0 or any later version. See LICENSE file for full text. //! Benchmarks for the built binary use std::fs::File; use std::path::PathBuf; use std::process::{Command, Stdio}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; // require certain features for this benchmark #[cfg(not(all(feature = "memmap")))] compile_error!("missing required features"); // Require that we don't have compile-time-rng, which would cause the binary and the bench to use different RNG seeds. #[cfg(feature = "compile-time-rng")] compile_error!("compile-time-rng feature should be disabled for benchmarking"); criterion_group!(benches, bench_cuniq_count_vs_shell, bench_cuniq_report_vs_shell); criterion_main!(benches); /// primary test condition for comparing high cardinality const TEST_FILE_ENGLISH_WORDS: TestFile = TestFile::new("hamlet_words.txt", "hamlet", 5414, 20); const TEST_FILE_SMALL: TestFile = TestFile::new("small.txt", "small", 3, 100); const TEST_FILE_LARGE: TestFile = TestFile::new("large.txt", "large", 100000, 10); /// Various different test conditions for comparing against different programs. /// Some of these are disabled as they don't provide much insight but they slow down the benchmarks. static TEST_FILES: &[TestFile] = &[ // TestFile::new("empty.txt", "c1 empty", 1, 20), // TestFile::new("same_line.txt", "c1", 1, 20), // TestFile::new("needle1.txt", "needle@start", 2, 20), // TestFile::new("needle2.txt", "needle@end", 2, 20), // TestFile::new("cardinality_10.txt", "c10", 10, 20), // TestFile::new("cardinality_100.txt", "c100", 100, 20), TestFile::new("cardinality_1000.txt", "c1e3", 1000, 20), TestFile::new("shuffled_numbers.txt", "c1e6", 1000000, 10), TEST_FILE_SMALL, TEST_FILE_ENGLISH_WORDS, TEST_FILE_LARGE, ]; struct TestFile { filename: &'static str, description: &'static str, expected: usize, sample_size: usize, } impl TestFile { const fn new(filename: &'static str, description: &'static str, expected: usize, sample_size: usize) -> Self { Self { filename, description, expected, sample_size, } } fn relative_path(&self) -> PathBuf { let mut path: PathBuf = PathBuf::from(env!("CARGO_MANIFEST_DIR")); path.pop(); path.push("test_files"); path.push(self.filename); path } } /// Benches cuniq counts vs other shell commands. /// TODO fix hardcoded paths fn bench_cuniq_count_vs_shell(c: &mut Criterion) { // get cuniq exe path let cuniq_path = env!("CARGO_BIN_EXE_cuniq"); println!("running benchmarks against \"{cuniq_path}\""); for test_file in TEST_FILES { let path_buf = test_file.relative_path(); let mut group = c.benchmark_group(format!("count/{}", test_file.description)); group.sample_size(test_file.sample_size); let expected = format!("{}\n", test_file.expected); // sort input.txt | uniq | wc -l group.bench_function("uniq", |bencher| { bencher.iter(|| { let sort = Command::new(r"C:\Program Files\Git\usr\bin\sort.exe") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let uniq = Command::new(r"C:\Program Files\Git\usr\bin\uniq.exe") .stdin(Stdio::from(sort.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let wc = Command::new(r"C:\Program Files\Git\usr\bin\wc.exe") .arg("-l") .stdin(Stdio::from(uniq.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = wc.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); // sort -u input.txt | wc -l group.bench_function("sort", |bencher| { bencher.iter(|| { let sort = Command::new(r"C:\Program Files\Git\usr\bin\sort.exe") .arg("-u") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let wc = Command::new(r"C:\Program Files\Git\usr\bin\wc.exe") .arg("-l") .stdin(Stdio::from(sort.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = wc.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); // cuniq input.txt group.bench_function("cuniq", |bencher| { bencher.iter(|| { let cuniq = Command::new(cuniq_path) .arg("--no-stdin") .arg("--memmap") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = cuniq.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); // cuniq --mode=near-exact input.txt group.bench_function("cuniq-hash", |bencher| { bencher.iter(|| { let cuniq = Command::new(cuniq_path) .arg("--no-stdin") .arg("--memmap") .arg("--mode=near-exact") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = cuniq.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); // sortuniq < input.txt // uses normal file-based stdin // note that sortuniq only supports stdin group.bench_function("sortuniq", |bencher| { bencher.iter(|| { let sortuniq = Command::new("sortuniq.exe") .stdin(Stdio::from(File::open(&path_buf).unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let wc = Command::new(r"C:\Program Files\Git\usr\bin\wc.exe") .arg("-l") .stdin(Stdio::from(sortuniq.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = wc.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); // runiq --filter=simple input.txt group.bench_function("runiq", |bencher| { bencher.iter(|| { let runiq = Command::new("runiq.exe") .arg("--filter=simple") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let wc = Command::new(r"C:\Program Files\Git\usr\bin\wc.exe") .arg("-l") .stdin(Stdio::from(runiq.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = wc.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); // runiq --filter=simple input.txt // note that runiq's default filter "quick" is theoretically vulnerable to hash collisions. group.bench_function("runiq-hash", |bencher| { bencher.iter(|| { let runiq = Command::new("runiq.exe") .arg("--filter=quick") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let wc = Command::new(r"C:\Program Files\Git\usr\bin\wc.exe") .arg("-l") .stdin(Stdio::from(runiq.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = wc.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); // huniq < input.txt // uses normal file-based stdin // note that huniq only supports stdin // note that this is an unfair benchmark, as huniq only stores the hash group.bench_function("huniq", |bencher| { bencher.iter(|| { let huniq = Command::new("huniq.exe") .stdin(Stdio::from(File::open(&path_buf).unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let wc = Command::new(r"C:\Program Files\Git\usr\bin\wc.exe") .arg("-l") .stdin(Stdio::from(huniq.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = wc.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); assert_eq!(result, &expected); }); }); group.finish(); } } /// Benches cuniq reports vs other shell commands. /// TODO fix hardcoded paths fn bench_cuniq_report_vs_shell(c: &mut Criterion) { // get cuniq exe path let cuniq_path = env!("CARGO_BIN_EXE_cuniq"); println!("running benchmarks against \"{cuniq_path}\""); for test_file in TEST_FILES { let path_buf = test_file.relative_path(); let mut group = c.benchmark_group(format!("report/{}", test_file.description)); group.sample_size(test_file.sample_size); // sort input.txt | uniq -c group.bench_function("uniq", |bencher| { bencher.iter(|| { let sort = Command::new(r"C:\Program Files\Git\usr\bin\sort.exe") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let uniq = Command::new(r"C:\Program Files\Git\usr\bin\uniq.exe") .arg("-c") .stdin(Stdio::from(sort.stdout.unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = uniq.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); black_box(result); }); }); // cuniq -c input.txt group.bench_function("cuniq", |bencher| { bencher.iter(|| { let cuniq = Command::new(cuniq_path) .arg("--no-stdin") .arg("--memmap") .arg("--report") .arg(path_buf.as_os_str()) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = cuniq.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); black_box(result); }); }); // sortuniq -c < input.txt // uses normal file-based stdin // note that sortuniq only supports stdin group.bench_function("sortuniq", |bencher| { bencher.iter(|| { let sortuniq = Command::new("sortuniq.exe") .arg("-c") .stdin(Stdio::from(File::open(&path_buf).unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = sortuniq.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); black_box(result); }); }); // huniq -c < input.txt // uses normal file-based stdin // note that huniq only supports stdin group.bench_function("huniq", |bencher| { bencher.iter(|| { let huniq = Command::new("huniq.exe") .arg("-c") .stdin(Stdio::from(File::open(&path_buf).unwrap())) .stdout(Stdio::piped()) .spawn() .unwrap(); let output = huniq.wait_with_output().unwrap(); let result = std::str::from_utf8(&output.stdout).unwrap(); black_box(result); }); }); group.finish(); } }