Crates.io | krnl-macros |
lib.rs | krnl-macros |
version | 0.1.1 |
source | src |
created_at | 2023-04-10 23:52:16.448623 |
updated_at | 2024-05-28 01:41:46.749949 |
description | Macros for krnl. |
homepage | https://github.com/charles-r-earp/krnl |
repository | https://github.com/charles-r-earp/krnl |
max_upload_size | |
id | 835573 |
size | 72,563 |
Safe, portable, high performance compute (GPGPU) kernels.
Developed for autograph.
Kernel compiler for krnl.
See the docs for installation and usage instructions.
For device functionality (kernels), install Vulkan for your platform.
Check that `vulkaninfo --summary` shows your devices.
Alternatively, check that `cargo test --test integration_tests -- --exact none` shows your devices.
Run all the tests with `cargo test --all-features`.
See the docs or build them locally with `cargo doc --all-features`.
use krnl::{
macros::module,
anyhow::Result,
device::Device,
buffer::{Buffer, Slice, SliceMut},
};
// Kernel module for the saxpy example. The whole `mod` is annotated with `#[module]`, and
// each `#[kernel]` function in it is later reached from host code as
// `kernels::<name>::builder()`.
#[module]
mod kernels {
// `krnl_core` is pulled in through `krnl` only when NOT compiling for the `spirv` target.
// NOTE(review): presumably it is provided some other way on the spirv target; confirm in the krnl docs.
#[cfg(not(target_arch = "spirv"))]
use krnl::krnl_core;
use krnl_core::macros::kernel;
// Scalar saxpy step: `*y += alpha * x`.
// A plain function (no `#[kernel]`), so both kernels below and the host fallback can call it.
pub fn saxpy_impl(alpha: f32, x: f32, y: &mut f32) {
*y += alpha * x;
}
// Item kernels for iterator patterns.
// The body sees a single `x` / `y` element per invocation (`#[item]`), while the host side
// dispatches this kernel with whole slices.
#[kernel]
pub fn saxpy(alpha: f32, #[item] x: f32, #[item] y: &mut f32) {
saxpy_impl(alpha, x, y);
}
// General purpose kernels like CUDA / OpenCL.
// Receives the whole buffers (`#[global]`) and indexes them manually by global id.
#[kernel]
pub fn saxpy_global(alpha: f32, #[global] x: Slice<f32>, #[global] y: UnsafeSlice<f32>) {
use krnl_core::buffer::UnsafeIndex;
// `kernel` is not a declared parameter. NOTE(review): presumably injected by `#[kernel]`; confirm.
let global_id = kernel.global_id();
// Guard so that ids beyond the shorter of the two buffers do nothing.
if global_id < x.len().min(y.len()) {
// SAFETY: `global_id` was bounds-checked against `y.len()` just above.
// NOTE(review): also assumes every invocation has a distinct `global_id`, so no two
// invocations obtain a mutable reference to the same element; confirm against krnl docs.
saxpy_impl(alpha, x[global_id], unsafe { y.unsafe_index_mut(global_id) });
}
}
}
/// Applies `y += alpha * x` element-wise, on the host when possible, otherwise via a kernel.
///
/// If both `x` and `y` can be viewed as host slices, the update is performed inline over the
/// zipped elements (stopping at the shorter of the two). Otherwise a kernel is built for
/// `y`'s device and dispatched.
///
/// # Errors
///
/// Propagates any error from building or dispatching the kernel. The `saxpy_global` path
/// additionally fails if `y.len()` does not fit in a `u32`.
fn saxpy(alpha: f32, x: Slice<f32>, mut y: SliceMut<f32>) -> Result<()> {
    if let Some((x, y)) = x.as_host_slice().zip(y.as_host_slice_mut()) {
        x.iter()
            .copied()
            .zip(y.iter_mut())
            .for_each(|(x, y)| kernels::saxpy_impl(alpha, x, y));
        return Ok(());
    }
    // The constant condition only selects which of the two kernel styles this example
    // dispatches; flip it to exercise the `saxpy_global` variant.
    if true {
        kernels::saxpy::builder()?
            .build(y.device())?
            .dispatch(alpha, x, y)
    } else {
        // or
        // Checked conversion: `y.len() as u32` would silently wrap for lengths above
        // `u32::MAX` and launch too few threads, leaving part of `y` untouched.
        let global_threads = u32::try_from(y.len())?;
        kernels::saxpy_global::builder()?
            .build(y.device())?
            .with_global_threads(global_threads)
            .dispatch(alpha, x, y)
    }
}
/// Runs the saxpy example on one-element buffers and prints the resulting `y`.
///
/// # Errors
///
/// Propagates any error from moving the buffers to the device, running `saxpy`, or reading
/// the result back into a `Vec`.
fn main() -> Result<()> {
    let x = vec![1f32];
    let alpha = 2f32;
    let y = vec![0f32];
    // Best-effort device selection: if no device can be built, fall back to the host.
    // `unwrap_or_else` only constructs the host device when it is actually needed.
    let device = Device::builder()
        .build()
        .unwrap_or_else(|_| Device::host());
    let x = Buffer::from(x).into_device(device.clone())?;
    // Last use of `device`, so it is moved instead of cloned.
    let mut y = Buffer::from(y).into_device(device)?;
    saxpy(alpha, x.as_slice(), y.as_slice_mut())?;
    let y = y.into_vec()?;
    println!("{y:?}");
    Ok(())
}
NVIDIA GeForce GTX 1060 with Max-Q Design
| | krnl | cuda | ocl |
|---|---|---|---|
| 1,000,000 | 316.90 ns (✅ 1.00x) | 112.84 us (❌ 356.06x slower) | 495.45 ns (❌ 1.56x slower) |
| 10,000,000 | 318.15 ns (✅ 1.00x) | 1.10 ms (❌ 3454.98x slower) | 506.82 ns (❌ 1.59x slower) |
| 64,000,000 | 317.56 ns (✅ 1.00x) | 6.31 ms (❌ 19854.77x slower) | 506.15 ns (❌ 1.59x slower) |
| | krnl | cuda | ocl |
|---|---|---|---|
| 1,000,000 | 332.66 us (✅ 1.00x) | 359.18 us (✅ 1.08x slower) | 773.51 us (❌ 2.33x slower) |
| 10,000,000 | 4.83 ms (✅ 1.00x) | 3.69 ms (✅ 1.31x faster) | 8.76 ms (❌ 1.81x slower) |
| 64,000,000 | 25.24 ms (✅ 1.00x) | 24.34 ms (✅ 1.04x faster) | 57.02 ms (❌ 2.26x slower) |
| | krnl | cuda | ocl |
|---|---|---|---|
| 1,000,000 | 584.39 us (✅ 1.00x) | 447.38 us (✅ 1.31x faster) | 20.17 ms (❌ 34.52x slower) |
| 10,000,000 | 5.67 ms (✅ 1.00x) | 4.03 ms (✅ 1.41x faster) | 20.15 ms (❌ 3.55x slower) |
| 64,000,000 | 28.82 ms (✅ 1.00x) | 25.57 ms (✅ 1.13x faster) | 37.01 ms (❌ 1.28x slower) |
| | krnl | cuda | ocl |
|---|---|---|---|
| 1,000,000 | 38.15 us (✅ 1.00x) | 25.28 us (✅ 1.51x faster) | 34.12 us (✅ 1.12x faster) |
| 10,000,000 | 250.90 us (✅ 1.00x) | 242.95 us (✅ 1.03x faster) | 251.86 us (✅ 1.00x slower) |
| 64,000,000 | 1.53 ms (✅ 1.00x) | 1.55 ms (✅ 1.01x slower) | 1.56 ms (✅ 1.02x slower) |
| | krnl | cuda | ocl |
|---|---|---|---|
| 1,000,000 | 90.76 us (✅ 1.00x) | 81.16 us (✅ 1.12x faster) | 88.94 us (✅ 1.02x faster) |
| 10,000,000 | 746.92 us (✅ 1.00x) | 770.03 us (✅ 1.03x slower) | 779.90 us (✅ 1.04x slower) |
| 64,000,000 | 4.71 ms (✅ 1.00x) | 4.90 ms (✅ 1.04x slower) | 4.91 ms (✅ 1.04x slower) |
Dual-licensed to be compatible with the Rust project.
Licensed under the Apache License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 or the MIT license http://opensource.org/licenses/MIT, at your option. This file may not be copied, modified, or distributed except according to those terms.
Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.