use rand::Rng;

// Sequential reference implementation: C += A * B for n x n row-major matrices.
fn gemm_seq(a: &[i32], b: &[i32], c: &mut [i32], n: usize) {
    for i in 0..n {
        for j in 0..n {
            for k in 0..n {
                c[i * n + j] += a[i * n + k] * b[k * n + j];
            }
        }
    }
}

// Offloads the GEMM to MCL, splitting `a` (and the matching rows of each
// result matrix) into 64-row tiles and submitting one task per tile. When
// `sync` is true each task is awaited immediately after submission;
// otherwise all tasks are joined at the end.
async fn gemm_mcl(
    env: &mcl_rs::Mcl,
    a: &[i32],
    b: &[i32],
    cs: &mut Vec<Vec<i32>>,
    n: &usize,
    sync: &bool,
) {
    let mut hdls = Vec::new();
    let tile_size = 64 * n; // one tile = 64 rows of n elements
    let num_tiles = a.len() / tile_size;
    println!("a.len() = {}, num_tiles = {num_tiles}, tile_size = {tile_size}", a.len());
    // Global work size: 64 rows per tile x n columns x 1.
    let pes: [u64; 3] = [64, *n as u64, 1];
    println!("cs {:?}", cs.len());
    for c in cs.iter_mut() {
        let buffer = env.register_buffer(
            mcl_rs::TaskArg::output_slice(c)
                .resident(true)
                .dynamic(true),
        ); // The registration is reference counted, so it remains active as long as this handle, or any task handle using it, remains in scope.
        for j in 0..num_tiles {
            let s_i = tile_size * j;
            let e_i = s_i + tile_size;
            let new_a = &a[s_i..e_i];
            hdls.push(
                env.task("gemmN", 4)
                    .arg(mcl_rs::TaskArg::input_slice(new_a).resident(true))
                    .arg(mcl_rs::TaskArg::input_slice(b).resident(true))
                    .arg(mcl_rs::TaskArg::input_scalar(n))
                    .arg_buffer(buffer.sub_buffer(s_i..e_i)) // runtime reference counting ensures memory safety
                    .dev(mcl_rs::DevType::ANY)
                    .exec(pes),
            );
            if *sync {
                hdls.pop().expect("task just pushed to vec").await;
            }
        }
        // `buffer` goes out of scope here, but it stays registered because the
        // task handles in `hdls` still reference it.
    }
    if !*sync {
        futures::future::join_all(hdls).await;
    }
    // All references to the registered buffers have been dropped at this
    // point, so they are automatically unregistered.
}

const REPS: usize = 10;

#[test]
fn tiled_gemm() {
    let workers = 2;
    let n = 128;
    let nn = n * n;
    let reps = REPS;
    let sync = true;

    let env = mcl_rs::MclEnvBuilder::new()
        .num_workers(workers)
        .initialize();
    env.load_prog("tests/gemmN.cl", mcl_rs::PrgType::Src);

    let mut rng = rand::thread_rng();
    // Generate a and b matrices of size n x n, initialized with random
    // numbers in [0, 100).
    let a: Vec<i32> = (0..nn).map(|_| rng.gen_range(0..100)).collect();
    let b: Vec<i32> = (0..nn).map(|_| rng.gen_range(0..100)).collect();

    // Allocate the c matrices that will hold the results: one buffer per repetition.
    let mut cs: Vec<Vec<i32>> = vec![vec![0; nn]; reps];
    let mut c_seq: Vec<i32> = vec![0; nn];

    let start = std::time::Instant::now();
    gemm_seq(&a, &b, &mut c_seq, n);
    let seq_time = start.elapsed().as_secs_f32();
    println!("{:?} {:?}", &c_seq[0..10], &c_seq[(nn - 10)..nn]);

    println!("Sync mcl gemm");
    let start = std::time::Instant::now();
    futures::executor::block_on(gemm_mcl(&env, &a, &b, &mut cs, &n, &sync));
    let sync_time = start.elapsed().as_secs_f32();
    for c in cs {
        assert_eq!(c_seq, c);
    }

    let mut cs = vec![vec![0; nn]; reps]; // fresh result buffers for the async run
    let sync = false;
    println!("Async mcl gemm");
    let start = std::time::Instant::now();
    futures::executor::block_on(gemm_mcl(&env, &a, &b, &mut cs, &n, &sync));
    let async_time = start.elapsed().as_secs_f32();
    for c in cs {
        assert_eq!(c_seq, c);
    }

    println!("Seq: {seq_time}, Async: {async_time}, Sync: {sync_time}");
}
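
// For reference, a minimal sketch of the `gemmN` kernel that `tests/gemmN.cl`
// is expected to supply. It assumes the argument order matches the four
// `TaskArg`s above (a 64-row tile of `a`, all of `b`, the scalar `n`, the
// matching tile of `c`), that the element type is a 32-bit int, and that each
// of the 64 x n work items computes one output element. This is an
// illustrative sketch, not the repository's actual kernel source:
//
//   __kernel void gemmN(__global const int* a, __global const int* b,
//                       const ulong n, __global int* c) {
//       size_t i = get_global_id(0); // row within the 64-row tile
//       size_t j = get_global_id(1); // column, 0..n
//       int sum = 0;
//       for (size_t k = 0; k < n; k++) {
//           sum += a[i * n + k] * b[k * n + j];
//       }
//       c[i * n + j] = sum;
//   }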