use cudarc::driver::*; use cudarc::nvrtc::compile_ptx; use std::thread; const KERNEL_SRC: &str = " extern \"C\" __global__ void hello_world(int i) { printf(\"Hello from the cuda kernel in thread %d\\n\", i); } "; fn main() -> Result<(), DriverError> { let cfg = LaunchConfig { grid_dim: (1, 1, 1), block_dim: (1, 1, 1), shared_mem_bytes: 0, }; { // Option 1: use the same device on each thread. // This requires calling the CudaDevice::bind_to_thread() method. // Note that all kernels are submitted to the same stream/context, // so the kernels will still execute in sequentially in the order // they are submitted to the gpu. let dev = CudaDevice::new(0)?; let ptx = compile_ptx(KERNEL_SRC).unwrap(); dev.load_ptx(ptx, "kernel", &["hello_world"])?; // explicit borrow so we don't have to re-clone the device for each thread let dev = &dev; thread::scope(move |s| { for i in 0..10i32 { s.spawn(move || { // NOTE: this is the important call to have // without this, you'll get a CUDA_ERROR_INVALID_CONTEXT dev.bind_to_thread()?; let f = dev.get_func("kernel", "hello_world").unwrap(); unsafe { f.launch(cfg, (i,)) } }); } }); } { // Option 2: create a new device in each thread // This requires loading the PTX for each device, since they won't // share a loaded modules on the Rust side of things. let ptx = compile_ptx(KERNEL_SRC).unwrap(); thread::scope(|s| { for i in 0..10i32 { let ptx = ptx.clone(); s.spawn(move || { let dev = CudaDevice::new(0)?; dev.load_ptx(ptx, "kernel", &["hello_world"])?; let f = dev.get_func("kernel", "hello_world").unwrap(); unsafe { f.launch(cfg, (i + 100,)) } }); } }); } Ok(()) }