/// ------------Lamellar Bandwidth: AM flops -------------------------
/// Similar to the AM bandwidth tests, but instead of transferring data,
/// each active message performs some number of dummy multiply-add operations.
/// Not to be used as a true measure of the flops of a system, but it can be
/// useful for comparing multiple systems and/or performing worksize to
/// RT latency analyses.
/// --------------------------------------------------------------------
use lamellar::ActiveMessaging;
use std::time::Instant;
// #[cfg(feature = "nightly")]
//use packed_simd::{f64x8, Simd};

#[lamellar::AmData(Clone, Debug)]
struct FlopAM {
    iterations: usize,
}

#[lamellar::am]
impl LamellarAM for FlopAM {
    async fn exec(&self) -> usize {
        let mut a = [1.2345f64; 128];
        for _i in 0..self.iterations {
            for a_i in a.iter_mut() {
                *a_i = (*a_i) * (*a_i) + (*a_i);
            }
        }
        let temp_ops = self.iterations * a.len() * 2;
        let num_ops = if temp_ops > a.len() * 2 {
            temp_ops as usize
        } else {
            // this should never actually happen, but it prevents `a` from being optimized away
            a[0] as usize
        };
        num_ops
    }
}

// #[cfg(feature = "nightly")]
// #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)]
// struct SimdAM {
//     iterations: usize,
// }

// #[cfg(feature = "nightly")]
// #[lamellar::am]
// impl LamellarAM for SimdAM {
//     async fn exec(&self) -> usize {
//         let mut a: [Simd<[f64; 8]>; 16] = [f64x8::new(
//             1.2345, 1.2345, 1.2345, 1.2345, 1.2345, 1.2345, 1.2345, 1.2345,
//         ); 16];
//         for _i in 0..self.iterations {
//             for a_i in a.iter_mut() {
//                 *a_i = (*a_i) * (*a_i) + (*a_i);
//             }
//         }
//         let temp_ops = self.iterations * a.len() * 8 * 2;
//         let num_ops = if temp_ops > a.len() * 2 {
//             temp_ops as usize
//         } else {
//             //this should never actually happen but should prevent optimizing A away
//             a[0].extract(1) as usize
//         };
//         num_ops
//     }
// }

fn main() {
    let world = lamellar::LamellarWorldBuilder::new()
        // .with_lamellae( Backend::Local )
        .build();
    let my_pe = world.my_pe();
    let num_pes = world.num_pes();
    world.barrier();
    let s = Instant::now();
    world.barrier();
    let b = s.elapsed().as_secs_f64();
    println!("Barrier latency: {:?}s {:?}us", b, b * 1_000_000 as f64);

    if my_pe == 0 {
        println!("==================Flop test===========================");
    }

    let mut flops = vec![];
    let num_tasks = 1000;
    let num_cores = match std::env::var("LAMELLAR_THREADS") {
        Ok(n) => n.parse::<usize>().unwrap() as f64,
        Err(_) => 4 as f64,
    };
    for i in 0..27 {
        let num_iterations = 2_u64.pow(i) as usize;
        let timer = Instant::now();
        let mut sub_time = 0f64;
        let mut reqs = vec![];
        if my_pe == 0 {
            for _j in 0..num_tasks {
                let sub_timer = Instant::now();
                reqs.push(
                    world
                        .exec_am_all(FlopAM {
                            iterations: num_iterations,
                        })
                        .spawn(),
                );
                sub_time += sub_timer.elapsed().as_secs_f64();
            }
            println!("issue time: {:?}", timer.elapsed().as_secs_f64());
            world.wait_all();
        }
        world.barrier();
        let cur_t = timer.elapsed().as_secs_f64();
        let tot_flop: usize = reqs
            .drain(0..)
            .map(|r| r.block().drain(0..).sum::<usize>())
            .sum();
        let task_granularity = ((cur_t * num_cores) / (num_tasks * num_pes) as f64) * 1000.0f64;
        if my_pe == 0 {
            println!(
                "iter size: {:?} tot_flop: {:?} time: {:?} (issue time: {:?}) GFLOPS (avg): {:?} ({:?}%) task_gran: {:?}(ms)",
                num_iterations, //transfer size
                tot_flop,       //num transfers
                cur_t,          //transfer time
                sub_time,
                ((tot_flop as f64) / cur_t) / 1_000_000_000f64, // throughput of user payload
                (((tot_flop as f64) / cur_t) / 1_000_000_000f64) / 2800f64,
                task_granularity,
            );
            flops.push(((tot_flop as f64) / cur_t) / 1_000_000_000f64);
        }
        // #[cfg(feature = "nightly")]
        // {
        //     reqs.clear();
        //     let timer = Instant::now();
        //     let mut sub_time = 0f64;
        //     if my_pe == 0 {
        //         for _j in 0..num_tasks {
        //             let sub_timer = Instant::now();
        //             reqs.push(world.exec_am_all(SimdAM {
        //                 iterations: num_iterations,
        //             }));
        //             sub_time += sub_timer.elapsed().as_secs_f64();
        //         }
        //         println!("issue time: {:?}", timer.elapsed().as_secs_f64());
        //         world.wait_all();
        //     }
        //     world.barrier();
        //     let cur_t = timer.elapsed().as_secs_f64();
        //     let tot_flop: usize = reqs
        //         .iter()
        //         .map(|r| r.block().iter().map(|r| r.unwrap()).sum::<usize>())
        //         .sum();
        //     let task_granularity = ((cur_t * 24f64) / num_tasks as f64) * 1000.0f64;
        //     if my_pe == 0 {
        //         println!(
        //             "nightly iter size: {:?} tot_flop: {:?} time: {:?} (issue time: {:?}) GFLOPS (avg): {:?} ({:?}%) task_gran: {:?}(ms)",
        //             num_iterations, //transfer size
        //             tot_flop,       //num transfers
        //             cur_t,          //transfer time
        //             sub_time,
        //             ((tot_flop as f64) / cur_t) / 1_000_000_000f64, // throughput of user payload
        //             (((tot_flop as f64) / cur_t) / 1_000_000_000f64) / 2800f64,
        //             task_granularity,
        //         );
        //         flops.push(((tot_flop as f64) / cur_t) / 1_000_000_000f64);
        //     }
        // }
    }
    if my_pe == 0 {
        println!(
            "flops: {}",
            flops
                .iter()
                .fold(String::new(), |acc, &num| acc + &num.to_string() + ", ")
        );
    }
    // });
}
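
// The GFLOPS figure printed above comes from the flop count each FlopAM returns
// (iterations * array length * 2, i.e. one multiply and one add per element per
// iteration) divided by the measured wall time. A minimal sketch of that
// arithmetic, factored into a hypothetical helper that is not part of the
// original benchmark:
#[allow(dead_code)]
fn reported_gflops(tot_flop: usize, elapsed_secs: f64) -> f64 {
    // total flops observed across all AMs / elapsed seconds / 1e9
    (tot_flop as f64) / elapsed_secs / 1_000_000_000f64
}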