//! some testing utilities. //! run with to get output statistics : cargo test --release -- --nocapture --test test_parallel. //! serial test corresponds to random-10nn-euclidean(k=10) //! parallel test corresponds to random data in 25 dimensions k = 10, dist Cosine use rand::distributions::Uniform; use rand::prelude::*; use skiplist::OrderedSkipList; use anndists::dist; use hnsw_rs::prelude::*; use serde::{de::DeserializeOwned, Serialize}; pub fn gen_random_vector_f32(nbrow: usize) -> Vec { let mut rng = thread_rng(); let unif = Uniform::::new(0., 1.); let vec = (0..nbrow) .into_iter() .map(|_| rng.sample(unif)) .collect::>(); vec } /// return nbcolumn vectors of dimension nbrow pub fn gen_random_matrix_f32(nbrow: usize, nbcolumn: usize) -> Vec> { let mut rng = thread_rng(); let unif = Uniform::::new(0., 1.); let mut data = Vec::with_capacity(nbcolumn); for _ in 0..nbcolumn { let column = (0..nbrow) .into_iter() .map(|_| rng.sample(unif)) .collect::>(); data.push(column); } return data; } fn brute_force_neighbours( nb_neighbours: usize, refdata: &PointIndexation, distance: PointDistance, data: &Vec, ) -> OrderedSkipList { let mut neighbours = OrderedSkipList::::with_capacity(refdata.get_nb_point()); let mut ptiter = refdata.into_iter(); let mut more = true; while more { if let Some(point) = ptiter.next() { let dist_p = distance.eval(data, point.get_v()); let ordered_point = PointIdWithOrder::new(point.get_point_id(), dist_p); // log::debug!(" brute force inserting {:?}", ordered_point); if neighbours.len() < nb_neighbours { neighbours.insert(ordered_point); } else { neighbours.insert(ordered_point); neighbours.pop_back(); } } else { more = false; } } // end while neighbours } // end of brute_force_2 //================================================================================================ mod tests { use cpu_time::ProcessTime; use std::time::Duration; use super::*; use dist::l2_normalize; #[test] fn test_serial() { // // let nb_elem = 1000; let dim = 10; let knbn = 10; let ef = 20; let parallel = true; // println!("\n\n test_serial nb_elem {:?}", nb_elem); // let data = gen_random_matrix_f32(dim, nb_elem); let data_with_id = data.iter().zip(0..data.len()).collect(); let ef_c = 400; let max_nb_connection = 32; let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize); let mut hns = Hnsw::::new( max_nb_connection, nb_elem, nb_layer, ef_c, dist::DistL1 {}, ); hns.set_extend_candidates(true); hns.set_keeping_pruned(true); let mut start = ProcessTime::now(); if parallel { println!("parallel insertion"); hns.parallel_insert(&data_with_id); } else { println!("serial insertion"); for i in 0..data.len() { hns.insert((&data[i], i)); } } let mut cpu_time: Duration = start.elapsed(); println!(" hnsw serial data insertion {:?}", cpu_time); hns.dump_layer_info(); println!(" hnsw data nb point inserted {:?}", hns.get_nb_point()); // let nbtest = 300; let mut recalls = Vec::::with_capacity(nbtest); let mut nb_returned = Vec::::with_capacity(nb_elem); let mut search_times = Vec::::with_capacity(nbtest); for _itest in 0..nbtest { // let mut r_vec = Vec::::with_capacity(dim); let mut rng = thread_rng(); let unif = Uniform::::new(0., 1.); for _ in 0..dim { r_vec.push(rng.sample(unif)); } start = ProcessTime::now(); let brute_neighbours = brute_force_neighbours( knbn, hns.get_point_indexation(), Box::new(dist::DistL1 {}), &r_vec, ); cpu_time = start.elapsed(); if nbtest <= 100 { println!("\n\n **************** test {:?}", _itest); println!("\n brute force neighbours :"); println!("======================"); println!(" brute force computing {:?} \n ", cpu_time); for i in 0..brute_neighbours.len() { let p = brute_neighbours[i].point_id; println!(" {:?} {:?} ", p, brute_neighbours[i].dist_to_ref); } } // hns.set_searching_mode(true); start = ProcessTime::now(); let knn_neighbours = hns.search(&r_vec, knbn, ef); cpu_time = start.elapsed(); search_times.push(cpu_time.as_micros() as f32); if nbtest <= 100 { println!("\n\n hnsw searching {:?} \n", cpu_time); println!("\n knn neighbours"); println!("======================"); for n in &knn_neighbours { println!(" {:?} {:?} {:?} ", n.d_id, n.p_id, n.distance); } } // compute recall let knn_neighbours_dist: Vec = knn_neighbours.iter().map(|p| p.distance).collect(); let max_dist = brute_neighbours[knbn - 1].dist_to_ref; let recall = knn_neighbours_dist .iter() .filter(|d| *d <= &max_dist) .count(); if nbtest <= 100 { println!("recall {:?}", (recall as f32) / (knbn as f32)); } recalls.push(recall); nb_returned.push(knn_neighbours.len()); } // end on nbtest // // compute recall // let mean_recall = (recalls.iter().sum::() as f32) / ((knbn * recalls.len()) as f32); let mean_search_time = (search_times.iter().sum::() as f32) / (search_times.len() as f32); println!( "\n mean fraction (of knbn) returned by search {:?} ", (nb_returned.iter().sum::() as f32) / ((nb_returned.len() * knbn) as f32) ); println!( "\n nb element {:?} nb search : {:?} recall rate is {:?} search time inverse {:?} ", nb_elem, nbtest, mean_recall, 1.0e+6_f32 / mean_search_time ); } // end test1 #[test] fn test_parallel() { // let nb_elem = 1000; let dim = 25; let knbn = 10; let ef_c = 800; let max_nb_connection = 48; let ef = 20; // // let mut data = gen_random_matrix_f32(dim, nb_elem); for i in 0..data.len() { l2_normalize(&mut data[i]); } let data_with_id = data.iter().zip(0..data.len()).collect(); let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize); let mut hns = Hnsw::::new( max_nb_connection, nb_elem, nb_layer, ef_c, dist::DistDot {}, ); // ! // hns.set_extend_candidates(true); let mut start = ProcessTime::now(); let now = std::time::SystemTime::now(); // parallel insertion hns.parallel_insert(&data_with_id); let mut cpu_time: Duration = start.elapsed(); println!( "\n hnsw data parallel insertion cpu time {:?} , system time {:?}", cpu_time, now.elapsed() ); // one serial more to check let mut v = gen_random_vector_f32(dim); l2_normalize(&mut v); hns.insert((&v, hns.get_nb_point() + 1)); // hns.dump_layer_info(); println!(" hnsw data nb point inserted {:?}", hns.get_nb_point()); // println!("\n hnsw testing requests ..."); let nbtest = 100; let mut recalls = Vec::::with_capacity(nbtest); let mut recalls_id = Vec::::with_capacity(nbtest); let mut search_times = Vec::::with_capacity(nbtest); for _itest in 0..nbtest { let mut r_vec = Vec::::with_capacity(dim); let mut rng = thread_rng(); let unif = Uniform::::new(0., 1.); for _ in 0..dim { r_vec.push(rng.sample(unif)); } l2_normalize(&mut r_vec); start = ProcessTime::now(); let brute_neighbours = brute_force_neighbours( knbn, hns.get_point_indexation(), Box::new(dist::DistDot), &r_vec, ); cpu_time = start.elapsed(); if nbtest <= 100 { println!("\n\n test_par nb_elem {:?}", nb_elem); println!("\n brute force neighbours :"); println!("======================"); println!(" brute force computing {:?} \n", cpu_time); for i in 0..brute_neighbours.len() { println!( " {:?} {:?} ", brute_neighbours[i].point_id, brute_neighbours[i].dist_to_ref ); } } // let knbn = 10; hns.set_searching_mode(true); start = ProcessTime::now(); let knn_neighbours = hns.search(&r_vec, knbn, ef); cpu_time = start.elapsed(); search_times.push(cpu_time.as_micros() as f32); if nbtest <= 100 { println!("\n knn neighbours"); println!("======================"); println!(" hnsw searching {:?} \n", cpu_time); for n in &knn_neighbours { println!(" {:?} \t {:?} \t {:?}", n.d_id, n.p_id, n.distance); } } // compute recall with balls let knn_neighbours_dist: Vec = knn_neighbours.iter().map(|p| p.distance).collect(); let max_dist = brute_neighbours[knbn - 1].dist_to_ref; let recall = knn_neighbours_dist .iter() .filter(|d| *d <= &max_dist) .count(); if nbtest <= 100 { println!("recall {:?}", (recall as f32) / (knbn as f32)); } recalls.push(recall); // compute recall with id let mut recall_id = 0; let mut knn_neighbours_id: Vec = knn_neighbours.iter().map(|p| p.p_id).collect(); knn_neighbours_id.sort_unstable(); let snbn = knbn.min(brute_neighbours.len()); for j in 0..snbn { let to_search = brute_neighbours[j].point_id; if knn_neighbours_id.binary_search(&to_search).is_ok() { recall_id += 1; } } recalls_id.push(recall_id); } // end on nbtest // // compute recall // let mean_recall = (recalls.iter().sum::() as f32) / ((knbn * recalls.len()) as f32); let mean_search_time = (search_times.iter().sum::() as f32) / (search_times.len() as f32); println!( "\n nb search {:?} recall rate is {:?} search time inverse {:?} ", nbtest, mean_recall, 1.0e+6_f32 / mean_search_time ); let mean_recall_id = (recalls.iter().sum::() as f32) / ((knbn * recalls.len()) as f32); println!("mean recall rate with point ids {:?}", mean_recall_id); // // assert!(1==0); } // end test_par }