//! Validation against bedtools use granges::{ commands::granges_random_bed, io::parsers::bed::bed_missing, prelude::{read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator}, ranges::GenomicRangeRecord, test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile}, Position, }; use indexmap::IndexMap; use std::{ fs::File, path::PathBuf, process::{Command, Stdio}, }; use serde::Deserialize; /// A macro to ensure that standard out is the same. In some cases /// this cannot be used, e.g. bedtools and granges have different /// output float precision, so validation in that case is more involved. /// Importantly, this also checks that standard out has a length > 0, /// which can be a sneaky edge case where output "matches" but is not /// valid since the comparison will pass spuriously. macro_rules! assert_stdout_eq { ($left:expr, $right:expr) => { let left_output = String::from_utf8_lossy(&$left.stdout); let right_output = String::from_utf8_lossy(&$right.stdout); // Check that lengths are greater than 0 assert!( !left_output.is_empty() && !right_output.is_empty(), "One or both outputs are empty" ); // Check that the outputs are equal assert_eq!(left_output, right_output, "Outputs are not equal"); }; } fn validate_bedfloats( bedtools_path: impl Into, granges_path: impl Into, genome: &IndexMap, tol: f64, context: Option, ) { let bedtools_path = bedtools_path.into(); let granges_path = granges_path.into(); let bedtools_iter = BedlikeIterator::new(bedtools_path).unwrap(); let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); let granges_iter = BedlikeIterator::new(granges_path).unwrap(); let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); let granges_data = granges_gr.take_data().unwrap(); let granges_data = granges_data.iter().map(|extra_cols| { // explicitly handle missing, then unwrap (rather than ok(), which is less safe) if *extra_cols == Some(".".to_string()) { return None; } let score: Result = extra_cols.as_ref().unwrap().parse(); // dbg!(&extra_cols); Some(score.unwrap()) }); let bedtools_data = bedtools_gr.take_data().unwrap(); let bedtools_data = bedtools_data.iter().map(|extra_cols| { if *extra_cols == Some(".".to_string()) { return None; } let score: f64 = extra_cols.as_ref().unwrap().parse().unwrap(); Some(score) }); assert_eq!(granges_data.len(), bedtools_data.len()); granges_data .zip(bedtools_data) .for_each(|(gr_val, bd_val)| match (gr_val, bd_val) { (Some(gr), Some(bd)) => assert!((gr - bd).abs() < tol), // NOTE: for some sum operations with no data, // bedtools returns '.' not 0.0. The latter is more correct // (the sum of the empty set is not NA, it's 0.0). // This is a shim so tests don't stochastically break // in this case. (Some(n), None) if n == 0.0 => (), (None, None) => (), _ => panic!("{:?}", (&context, &gr_val, &bd_val)), }); } // adjustable test sizes -- useful also for making very small // so string diffs are easier to compare. #[cfg(not(feature = "bench-big"))] const BED_LENGTH: usize = 100_000; #[cfg(feature = "bench-big")] const BED_LENGTH: usize = 1_000_000; // -- some validation helpers #[macro_export] macro_rules! assert_float_tol { ($a:expr, $b:expr) => {{ let tol = 1e-5; assert!( ($a - $b).abs() < tol, "assertion failed: `(left !== right)`\n left: `{:?}`, right: `{:?}`, tolerance: `{:?}`", $a, $b, tol ); }}; } #[macro_export] macro_rules! assert_option_float_tol { ($a:expr, $b:expr) => {{ let tol = 1e-5; match ($a, $b) { (Some(a_val), Some(b_val)) => { assert!( (a_val - b_val).abs() < tol, "assertion failed: `(left !== right)`\n left: `{:?}`, right: `{:?}`, tolerance: `{:?}`", a_val, b_val, tol ); } (None, None) => (), // Both are None, so do nothing. _ => panic!( "assertion failed: one is None and the other is Some\n left: `{:?}`, right: `{:?}`", $a, $b ), } }}; } // -- main validation tests below -- #[test] fn test_random_bed3file_filetype_detect() { let random_bedfile_path = temp_bedfile().path().to_path_buf(); granges_random_bed( "tests_data/hg38_seqlens.tsv", BED_LENGTH, Some(&random_bedfile_path), true, false, ) .expect("could not generate random BED file"); // head_file(&random_bedfile_path); match GenomicRangesFile::detect(random_bedfile_path).unwrap() { GenomicRangesFile::Bed3(_) => (), _ => panic!(": could not detect correct filetype"), } } #[test] fn test_against_bedtools_slop() { let random_bedfile = temp_bedfile(); let random_bedfile_path = random_bedfile.path(); granges_random_bed( "tests_data/hg38_seqlens.tsv", BED_LENGTH, Some(&random_bedfile_path), true, false, ) .expect("could not generate random BED file"); let width = 10; let bedtools_output = Command::new("bedtools") .arg("slop") .arg("-g") .arg("tests_data/hg38_seqlens.tsv") .arg("-b") .arg(width.to_string()) .arg("-i") .arg(&random_bedfile_path) .output() .expect("bedtools slop failed"); let granges_output = Command::new(granges_binary_path()) .arg("adjust") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--both") .arg(width.to_string()) .arg("--sort") .arg(&random_bedfile_path) .output() .expect("granges adjust failed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); assert_stdout_eq!(bedtools_output, granges_output); } /// Test bedtools intersect -a -b -wa -u /// against /// granges filter --genome --left --right #[test] fn test_against_bedtools_intersect_wa() { let num_ranges = 100_000; let random_bedfile_left_tempfile = random_bed3file(num_ranges); let random_bedfile_right_tempfile = random_bed3file(num_ranges); let random_bedfile_left = random_bedfile_left_tempfile.path(); let random_bedfile_right = random_bedfile_right_tempfile.path(); // for testing: uncomment and results are local for inspection // let random_bedfile_left = Path::new("test_left.bed"); // let random_bedfile_right = Path::new("test_right.bed"); granges_random_bed( "tests_data/hg38_seqlens.tsv", num_ranges, Some(&random_bedfile_right), true, false, ) .expect("could not generate random BED file"); let bedtools_output = Command::new("bedtools") .arg("intersect") .arg("-a") .arg(&random_bedfile_left) .arg("-b") .arg(&random_bedfile_right) .arg("-wa") .arg("-u") .output() .expect("bedtools intersect failed"); let granges_output = Command::new(granges_binary_path()) .arg("filter") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--left") .arg(&random_bedfile_left) .arg("--right") .arg(&random_bedfile_right) .output() .expect("granges adjust failed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); assert_stdout_eq!(bedtools_output, granges_output); } /// Test bedtools flank -g -i -l 10 -r 20 /// against /// granges filter --genome --left 10 --right 20 #[test] fn test_against_bedtools_flank() { let num_ranges = 1_000; let random_bedfile_tempfile = random_bed3file(num_ranges); let random_bedfile = random_bedfile_tempfile.path(); granges_random_bed( "tests_data/hg38_seqlens.tsv", num_ranges, Some(&random_bedfile), true, false, ) .expect("could not generate random BED file"); let bedtools_output = Command::new("bedtools") .arg("flank") .arg("-g") .arg("tests_data/hg38_seqlens.tsv") .arg("-l") .arg("20") .arg("-r") .arg("30") .arg("-i") .arg(&random_bedfile) .output() .expect("bedtools flank failed"); let granges_output = Command::new(granges_binary_path()) .arg("flank") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--left") .arg("20") .arg("--right") .arg("30") .arg(&random_bedfile) .output() .expect("granges flank failed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); let bedtools_str = String::from_utf8_lossy(&bedtools_output.stdout); let granges_str = String::from_utf8_lossy(&granges_output.stdout); assert!(bedtools_str.len() > 0); assert!(granges_str.len() > 0); let mut bedtools_ranges: Vec<_> = bedtools_str.split("\n").collect(); let mut granges_ranges: Vec<_> = granges_str.split("\n").collect(); bedtools_ranges.sort(); granges_ranges.sort(); assert_eq!(bedtools_ranges, granges_ranges); } #[test] fn test_against_bedtools_makewindows() { // some weird widths, steps to try to catch remainder issues let widths = vec![131123, 1_000_0013]; let steps = vec![10_001, 10_113]; // smaller test set: // let widths = vec![1_000_0013]; // let steps = vec![10000]; for width in widths.iter() { for step in steps.iter() { let bedtools_output = Command::new("bedtools") .arg("makewindows") .arg("-g") .arg("tests_data/hg38_seqlens.tsv") .arg("-w") .arg(width.to_string()) .arg("-s") .arg(step.to_string()) .output() .expect("bedtools slop failed"); let granges_output = Command::new(granges_binary_path()) .arg("windows") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--width") .arg(width.to_string()) .arg("--step") .arg(step.to_string()) .output() .expect("granges windows failed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); assert_stdout_eq!(bedtools_output, granges_output); } } } #[test] fn test_against_bedtools_map() { let num_ranges = BED_LENGTH; let width = 1_000_000; #[allow(unused_variables)] let step = 10_000; // can uncomment lines below to test this // make windows let windows_file = temp_bedfile(); let granges_windows_output = Command::new(granges_binary_path()) .arg("windows") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--width") .arg(width.to_string()) // .arg("--step") // .arg(step.to_string()) .arg("--output") .arg(windows_file.path()) .output() .expect("granges windows failed"); assert!( granges_windows_output.status.success(), "{:?}", granges_windows_output ); // we're going to test all of these operations // TODO/TEST need to test collapse let operations = vec!["sum", "min", "max", "mean", "median"]; for operation in operations { // create the random data BED5 let bedscores_file = random_bed5file(num_ranges); let bedtools_path = temp_bedfile(); let bedtools_output_file = File::create(&bedtools_path).unwrap(); // compare map commands let bedtools_output = Command::new("bedtools") .arg("map") .arg("-a") .arg(windows_file.path()) .arg("-b") .arg(&bedscores_file.path()) .arg("-c") .arg("5") .arg("-o") .arg(operation) .stdout(Stdio::from(bedtools_output_file)) .output() .expect("bedtools map failed"); let granges_output_file = temp_bedfile(); let granges_output = Command::new(granges_binary_path()) .arg("map") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--left") .arg(windows_file.path()) .arg("--right") .arg(bedscores_file.path()) .arg("--func") .arg(operation) .arg("--output") .arg(granges_output_file.path()) .output() .expect("granges map failed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); validate_bedfloats( bedtools_path.path(), granges_output_file.path().to_path_buf(), &genome, 1e-6, format!("operation: {}", operation).into(), ); } } #[test] fn test_against_bedtools_map_multiple() { // try multiple operations at once let num_ranges = BED_LENGTH; let width = 1_000_000; // #[allow(unused_variables)] // let step = 10_000; // can uncomment lines below to test this // make windows let windows_file = temp_bedfile(); let granges_windows_output = Command::new(granges_binary_path()) .arg("windows") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--width") .arg(width.to_string()) // .arg("--step") // .arg(step.to_string()) .arg("--output") .arg(windows_file.path()) .output() .expect("granges windows failed"); assert!( granges_windows_output.status.success(), "{:?}", granges_windows_output ); // copy_tempfile_for_inspection(&windows_file.path(), "windows.bed"); // create the random data BED5 let bedscores_file = random_bed5file(num_ranges); // copy_tempfile_for_inspection(&bedscores_file.path(), "scores.bed"); let bedtools_path = temp_bedfile(); let bedtools_output_file = File::create(&bedtools_path).unwrap(); // compare map commands let bedtools_output = Command::new("bedtools") .arg("map") .arg("-a") .arg(windows_file.path()) .arg("-b") .arg(&bedscores_file.path()) .arg("-c") .arg("5") .arg("-o") .arg("min,max,mean,sum,median") .stdout(Stdio::from(bedtools_output_file)) .output() .expect("bedtools map failed"); // copy_tempfile_for_inspection(&bedtools_path.path(), "bedtools.bed"); let granges_output_file = temp_bedfile(); let granges_output = Command::new(granges_binary_path()) .arg("map") .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--left") .arg(windows_file.path()) .arg("--right") .arg(bedscores_file.path()) .arg("--func") .arg("min,max,mean,sum-not-empty,median") .arg("--output") .arg(granges_output_file.path()) .output() .expect("granges map failed"); // copy_tempfile_for_inspection(&granges_output_file.path(), "granges.bed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); #[derive(Deserialize)] struct Stats { #[serde(deserialize_with = "bed_missing")] min: Option, #[serde(deserialize_with = "bed_missing")] max: Option, #[serde(deserialize_with = "bed_missing")] mean: Option, #[serde(deserialize_with = "bed_missing")] sum: Option, #[serde(deserialize_with = "bed_missing")] median: Option, } // head_file(&bedtools_path.path()); let bedtools_iter = TsvRecordIterator::>::new(bedtools_path.path()).expect("HERE"); let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); let granges_iter = TsvRecordIterator::>::new( granges_output_file.path().to_path_buf(), ) .unwrap(); let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); let granges_data = granges_gr.take_data().unwrap(); let bedtools_data = bedtools_gr.take_data().unwrap(); granges_data .iter() .zip(bedtools_data.iter()) .for_each(|(gr, bd)| { // dbg!((gr.min, bd.min)); assert_option_float_tol!(gr.min, bd.min); assert_option_float_tol!(gr.max, bd.max); // NOTE: this breaks because with bedools sum, // zero overlapping ranges = '.' (None), not 0.0. // Hence, our sum above is sum-not-empty assert_option_float_tol!(gr.sum, bd.sum); assert_option_float_tol!(gr.median, bd.median); assert_option_float_tol!(gr.mean, bd.mean); }); } #[test] fn test_against_bedtools_merge_empty() { let num_ranges = BED_LENGTH; let random_bedfile_path = random_bed3file(num_ranges); let distances = vec![0, 1, 10, 20]; for distance in distances { let bedtools_output = Command::new("bedtools") .arg("merge") .arg("-i") .arg(&random_bedfile_path.path()) .arg("-d") .arg(distance.to_string()) .output() .expect("bedtools merge failed"); let granges_output = Command::new(granges_binary_path()) .arg("merge") .arg("--bedfile") .arg(&random_bedfile_path.path()) .arg("-d") .arg(distance.to_string()) .output() .expect("granges merge failed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); assert_stdout_eq!(bedtools_output, granges_output); } } #[test] fn test_against_bedtools_merge_map() { let num_ranges = BED_LENGTH; let bedscores_file = random_bed5file(num_ranges); let distances = vec![0, 1, 10, 20]; let operations = vec!["sum", "min", "max", "mean", "median"]; for operation in operations { for distance in &distances { let bedtools_path = temp_bedfile(); let bedtools_output_file = File::create(&bedtools_path).unwrap(); let bedtools_output = Command::new("bedtools") .arg("merge") .arg("-i") .arg(&bedscores_file.path()) .arg("-d") .arg(distance.to_string()) .arg("-c") .arg("5") // the score column .arg("-o") .arg(operation) .stdout(Stdio::from(bedtools_output_file)) .output() .expect("bedtools merge failed"); let granges_output_file = temp_bedfile(); let granges_output = Command::new(granges_binary_path()) .arg("merge") .arg("--bedfile") .arg(&bedscores_file.path()) .arg("-d") .arg(distance.to_string()) .arg("--func") .arg(operation) .arg("--output") .arg(granges_output_file.path()) .output() .expect("granges merge failed"); assert!(bedtools_output.status.success(), "{:?}", bedtools_output); // head_file(granges_output_file.path()); assert!(granges_output.status.success(), "{:?}", granges_output); // head_file(bedtools_path.path()); let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); validate_bedfloats( bedtools_path.path(), granges_output_file.path().to_path_buf(), &genome, 1e-6, format!("operation: {}, distance: {}", operation, distance).into(), ); } } }