use rand::Rng; use sgpu_compute::prelude::*; #[derive(Debug, Copy, Clone, bytemuck::Zeroable, bytemuck::Pod)] #[repr(C)] struct Uniform { width: u32, } fn gen() -> [f32; N] { let mut out = [0.0; N]; rand::thread_rng().fill(&mut out[..]); out } fn parallel_prefix(input: &[f32; N]) -> [f32; N] { let mut out = [0.0; N]; let mut tot = 0.0; for i in 0..N { tot += input[i]; out[i] = tot; } return out; } fn main() { const PER_WORKER: u32 = 4; const N: usize = 3000; const N_WORKER: u32 = (N as u32 - 1) / PER_WORKER + 1; const N_WG: u32 = (N_WORKER - 1) / 16 + 1; const N_PADDED: usize = (N_WG * 16 * PER_WORKER) as _; let input: [f32; N] = gen(); let gpu = GpuCompute::new(); let mut pipeline = gpu.gen_pipeline( NonZeroUsize::new(std::mem::size_of::() * N_PADDED as usize / PER_WORKER as usize), [ StageDesc { name: Some("first_pass"), shader: include_str!("parallel_prefix.wgsl"), entrypoint: "pass1", }, StageDesc { name: Some("second_pass"), shader: include_str!("parallel_prefix.wgsl"), entrypoint: "pass2", }, StageDesc { name: Some("last_pass"), shader: include_str!("parallel_prefix.wgsl"), entrypoint: "pass3", }, ], ); pipeline.write_uniform(&Uniform { width: PER_WORKER }); let mut input_padded = [0.0; N_PADDED]; input_padded[..N].copy_from_slice(&input[..]); let result: [f32; N] = pipeline.run( &input_padded, [(N_WG as _, 1, 1), (1, 1, 1), (N_WG, 1, 1)], |vals: &[f32; N_PADDED]| { let mut res = [0.0f32; N]; res.copy_from_slice(&vals[..N]); res }, ); let expected = parallel_prefix(&input); for (i, (v, exp)) in result.iter().zip(expected.iter()).enumerate() { if (v / exp - 1.0).abs() > i as f32 * f32::EPSILON { pipeline.dbg_print_scratchpad::<[f32; N_PADDED / PER_WORKER as usize]>(); println!("Error at idx {}: {} ≠ {}", i, v, exp); } } }