| Crates.io | ringkernel-cuda-codegen |
| lib.rs | ringkernel-cuda-codegen |
| version | 0.4.0 |
| created_at | 2025-12-03 15:44:53.693631+00 |
| updated_at | 2026-01-25 21:22:25.012459+00 |
| description | CUDA code generation from Rust DSL for RingKernel stencil kernels |
| homepage | https://github.com/mivertowski/RustCompute |
| repository | https://github.com/mivertowski/RustCompute |
| max_upload_size | |
| id | 1964337 |
| size | 472,793 |
Rust-to-CUDA transpiler for RingKernel GPU kernels.
This crate enables writing GPU kernels in a restricted Rust DSL and transpiling them to CUDA C code. It supports three kernel types:
- General-purpose `__global__` functions
- Grid-based stencil kernels with the `GridPos` abstraction (2D and 3D)
- Persistent actor-model ring kernels

Add the crate to your `Cargo.toml`:

[dependencies]
ringkernel-cuda-codegen = "0.4"
syn = { version = "2.0", features = ["full"] }
For general-purpose CUDA kernels:
use ringkernel_cuda_codegen::transpile_global_kernel;
use syn::parse_quote;
let func: syn::ItemFn = parse_quote! {
fn saxpy(x: &[f32], y: &mut [f32], a: f32, n: i32) {
let idx = block_idx_x() * block_dim_x() + thread_idx_x();
if idx >= n { return; }
y[idx as usize] = a * x[idx as usize] + y[idx as usize];
}
};
let cuda_code = transpile_global_kernel(&func)?;
For grid-based computations with neighbor access (2D and 3D):
use ringkernel_cuda_codegen::{transpile_stencil_kernel, StencilConfig, Grid};
// 2D stencil
let func: syn::ItemFn = parse_quote! {
fn fdtd(p: &[f32], p_prev: &mut [f32], c2: f32, pos: GridPos) {
let lap = pos.north(p) + pos.south(p) + pos.east(p) + pos.west(p)
- 4.0 * p[pos.idx()];
p_prev[pos.idx()] = 2.0 * p[pos.idx()] - p_prev[pos.idx()] + c2 * lap;
}
};
let config = StencilConfig::new("fdtd")
.with_grid(Grid::Grid2D)
.with_tile_size(16, 16)
.with_halo(1);
let cuda_code = transpile_stencil_kernel(&func, &config)?;
// 3D stencil with up/down neighbors
let func_3d: syn::ItemFn = parse_quote! {
fn laplacian_3d(p: &[f32], out: &mut [f32], pos: GridPos) {
let lap = pos.north(p) + pos.south(p) + pos.east(p) + pos.west(p)
+ pos.up(p) + pos.down(p) - 6.0 * p[pos.idx()];
out[pos.idx()] = lap;
}
};
let config_3d = StencilConfig::new("laplacian")
.with_grid(Grid::Grid3D)
.with_tile_size(8, 8)
.with_halo(1);
For persistent actor-model kernels:
use ringkernel_cuda_codegen::{transpile_ring_kernel, RingKernelConfig};
let handler: syn::ItemFn = parse_quote! {
fn process(ctx: &RingContext, msg: &Request) -> Response {
let tid = ctx.global_thread_id();
ctx.sync_threads();
Response { value: msg.value * 2.0, id: tid as u64 }
}
};
let config = RingKernelConfig::new("processor")
.with_block_size(128)
.with_queue_capacity(1024)
.with_hlc(true) // Hybrid Logical Clocks
.with_k2k(true) // Kernel-to-kernel messaging
.with_envelope_format(true) // MessageEnvelope serialization
.with_kernel_id(1) // Kernel ID for routing
.with_hlc_node_id(1); // HLC node ID
let cuda_code = transpile_ring_kernel(&handler, &config)?;
When with_envelope_format(true) is enabled, messages use a standardized MessageEnvelope format:
// MessageHeader (256 bytes, cache-aligned)
typedef struct __align__(256) {
uint32_t magic; // 0xCAFEBABE
uint32_t version; // Protocol version
uint64_t type_id; // Message type identifier
uint64_t envelope_id; // Unique envelope ID
uint64_t correlation_id; // Request/response correlation
uint64_t source_kernel; // Source kernel ID
uint64_t target_kernel; // Target kernel ID
uint64_t hlc_wall; // HLC wall clock
uint64_t hlc_logical; // HLC logical counter
uint32_t hlc_node; // HLC node ID
uint32_t priority; // Message priority
uint32_t payload_size; // Payload size in bytes
uint32_t flags; // Message flags
uint8_t reserved[168]; // Padding to 256 bytes
} MessageHeader;
// MessageEnvelope = header + payload
This enables kernel-to-kernel routing (source/target kernel IDs), request/response correlation (correlation ID), and HLC timestamping via the header fields shown above.
**Index intrinsics**

- `thread_idx_x()`, `thread_idx_y()`, `thread_idx_z()` → `threadIdx.x/y/z`
- `block_idx_x()`, `block_idx_y()`, `block_idx_z()` → `blockIdx.x/y/z`
- `block_dim_x()`, `block_dim_y()`, `block_dim_z()` → `blockDim.x/y/z`
- `grid_dim_x()`, `grid_dim_y()`, `grid_dim_z()` → `gridDim.x/y/z`
- `warp_size()` → `warpSize`

**GridPos accessors (stencil kernels)**

- `pos.idx()` - Linear index
- `pos.north(buf)`, `pos.south(buf)` - Y-axis neighbors
- `pos.east(buf)`, `pos.west(buf)` - X-axis neighbors
- `pos.at(buf, dx, dy)` - Relative offset access
- `pos.up(buf)`, `pos.down(buf)` - Z-axis neighbors
- `pos.at(buf, dx, dy, dz)` - 3D relative offset access

**Synchronization**

- `sync_threads()` → `__syncthreads()` - Block-level barrier
- `sync_threads_count(pred)` → `__syncthreads_count()` - Count threads with predicate
- `sync_threads_and(pred)` → `__syncthreads_and()` - AND of predicate
- `sync_threads_or(pred)` → `__syncthreads_or()` - OR of predicate
- `thread_fence()` → `__threadfence()` - Device memory fence
- `thread_fence_block()` → `__threadfence_block()` - Block memory fence
- `thread_fence_system()` → `__threadfence_system()` - System memory fence

**Atomics**

- `atomic_add(ptr, val)` → `atomicAdd`
- `atomic_sub(ptr, val)` → `atomicSub`
- `atomic_min(ptr, val)` → `atomicMin`
- `atomic_max(ptr, val)` → `atomicMax`
- `atomic_exchange(ptr, val)` → `atomicExch`
- `atomic_cas(ptr, compare, val)` → `atomicCAS`
- `atomic_and(ptr, val)` → `atomicAnd`
- `atomic_or(ptr, val)` → `atomicOr`
- `atomic_xor(ptr, val)` → `atomicXor`
- `atomic_inc(ptr, val)` → `atomicInc` (increment with wrap)
- `atomic_dec(ptr, val)` → `atomicDec` (decrement with wrap)

**Math**

- `sqrt()`, `rsqrt()` - Square root, reciprocal sqrt
- `abs()`, `fabs()` - Absolute value
- `floor()`, `ceil()`, `round()`, `trunc()` - Rounding
- `fma()`, `mul_add()` - Fused multiply-add
- `fmin()`, `fmax()` - Minimum, maximum
- `fmod()`, `remainder()` - Modulo operations
- `copysign()` - Copy sign
- `cbrt()` - Cube root
- `hypot()` - Hypotenuse

**Trigonometric**

- `sin()`, `cos()`, `tan()` - Basic trig
- `asin()`, `acos()`, `atan()`, `atan2()` - Inverse trig
- `sincos()` - Combined sine and cosine
- `sinpi()`, `cospi()` - Sin/cos of π*x

**Hyperbolic**

- `sinh()`, `cosh()`, `tanh()` - Hyperbolic
- `asinh()`, `acosh()`, `atanh()` - Inverse hyperbolic

**Exponential and special functions**

- `exp()`, `exp2()`, `exp10()`, `expm1()` - Exponentials
- `log()`, `ln()`, `log2()`, `log10()`, `log1p()` - Logarithms
- `pow()`, `powf()`, `powi()` - Power
- `ldexp()`, `scalbn()` - Load/scale exponent
- `ilogb()` - Extract exponent
- `erf()`, `erfc()`, `erfinv()`, `erfcinv()` - Error functions
- `lgamma()`, `tgamma()` - Gamma functions

**Classification**

- `is_nan()`, `isnan()` → `isnan`
- `is_infinite()`, `isinf()` → `isinf`
- `is_finite()`, `isfinite()` → `isfinite`
- `is_normal()`, `isnormal()` → `isnormal`
- `signbit()` - Check sign bit
- `nextafter()` - Next representable value
- `fdim()` - Positive difference

**Warp-level primitives**

- `warp_active_mask()` → `__activemask()` - Active lane mask
- `warp_shfl(mask, val, lane)` → `__shfl_sync` - Shuffle
- `warp_shfl_up(mask, val, delta)` → `__shfl_up_sync`
- `warp_shfl_down(mask, val, delta)` → `__shfl_down_sync`
- `warp_shfl_xor(mask, val, lane_mask)` → `__shfl_xor_sync`
- `warp_ballot(mask, pred)` → `__ballot_sync`
- `warp_all(mask, pred)` → `__all_sync`
- `warp_any(mask, pred)` → `__any_sync`
- `warp_match_any(mask, val)` → `__match_any_sync`
- `warp_match_all(mask, val)` → `__match_all_sync`
- `warp_reduce_add(mask, val)` → `__reduce_add_sync`
- `warp_reduce_min(mask, val)` → `__reduce_min_sync`
- `warp_reduce_max(mask, val)` → `__reduce_max_sync`
- `warp_reduce_and(mask, val)` → `__reduce_and_sync`
- `warp_reduce_or(mask, val)` → `__reduce_or_sync`
- `warp_reduce_xor(mask, val)` → `__reduce_xor_sync`

**Bit manipulation**

- `popc()`, `popcount()`, `count_ones()` → `__popc` - Population count
- `clz()`, `leading_zeros()` → `__clz` - Count leading zeros
- `ctz()`, `trailing_zeros()` → `__ffs - 1` - Count trailing zeros
- `ffs()` → `__ffs` - Find first set
- `brev()`, `reverse_bits()` → `__brev` - Bit reverse
- `byte_perm()` → `__byte_perm` - Byte permutation
- `funnel_shift_left()` → `__funnelshift_l`
- `funnel_shift_right()` → `__funnelshift_r`

**Memory**

- `ldg(ptr)`, `load_global(ptr)` → `__ldg` - Read-only cache load
- `prefetch_l1(ptr)` → `__prefetch_l1` - L1 prefetch
- `prefetch_l2(ptr)` → `__prefetch_l2` - L2 prefetch

**Special**

- `rcp()`, `recip()` → `__frcp_rn` - Fast reciprocal
- `fast_div()` → `__fdividef` - Fast division
- `saturate()`, `clamp_01()` → `__saturatef` - Saturate to [0,1]
- `j0()`, `j1()`, `jn()` - Bessel functions of first kind
- `y0()`, `y1()`, `yn()` - Bessel functions of second kind
- `normcdf()`, `normcdfinv()` - Normal CDF
- `cyl_bessel_i0()`, `cyl_bessel_i1()` - Cylindrical Bessel functions

**Timing**

- `clock()` → `clock()` - 32-bit clock counter
- `clock64()` → `clock64()` - 64-bit clock counter
- `nanosleep(ns)` → `__nanosleep` - Sleep for nanoseconds

**RingContext methods (ring kernels)**

- `ctx.thread_id()` → `threadIdx.x`
- `ctx.block_id()` → `blockIdx.x`
- `ctx.global_thread_id()` → `(blockIdx.x * blockDim.x + threadIdx.x)`
- `ctx.sync_threads()` → `__syncthreads()`
- `ctx.lane_id()` → `(threadIdx.x % 32)`
- `ctx.warp_id()` → `(threadIdx.x / 32)`
- `is_active()`, `should_terminate()`, `mark_terminated()`
- `messages_processed()`, `input_queue_size()`, `output_queue_size()`
- `input_queue_empty()`, `output_queue_empty()`, `enqueue_response(&resp)`
- `hlc_tick()`, `hlc_update(ts)`, `hlc_now()` - HLC operations
- `k2k_send(target, &msg)` - Send message to target kernel
- `k2k_send_envelope(&envelope)` - Send full envelope with routing
- `k2k_try_recv()` - Non-blocking receive
- `k2k_try_recv_envelope()` - Receive with envelope metadata
- `k2k_has_message()`, `k2k_peek()`, `k2k_pending_count()`
- `k2k_get_source_kernel()` - Get source kernel ID from envelope
- `k2k_get_correlation_id()` - Get correlation ID for request/response

| Rust Type | CUDA Type |
|---|---|
| `f32` | `float` |
| `f64` | `double` |
| `i32` | `int` |
| `u32` | `unsigned int` |
| `i64` | `long long` |
| `u64` | `unsigned long long` |
| `bool` | `int` |
| `&[T]` | `const T* __restrict__` |
| `&mut [T]` | `T* __restrict__` |
The transpiler supports 120+ GPU intrinsics across 13 categories:
| Category | Count | Examples |
|---|---|---|
| Synchronization | 7 | sync_threads, thread_fence |
| Atomics | 11 | atomic_add, atomic_cas, atomic_and |
| Math | 16 | sqrt, fma, cbrt, hypot |
| Trigonometric | 11 | sin, asin, atan2, sincos |
| Hyperbolic | 6 | sinh, asinh |
| Exponential | 18 | exp, log2, erf, gamma |
| Classification | 8 | isnan, isfinite, signbit |
| Warp | 16 | warp_shfl, warp_reduce_add, warp_match_any |
| Bit Manipulation | 8 | popc, clz, brev, funnel_shift_left |
| Memory | 3 | ldg, prefetch_l1 |
| Special | 13 | rcp, saturate, normcdf |
| Index | 13 | thread_idx_x, warp_size |
| Timing | 3 | clock, clock64, nanosleep |
cargo test -p ringkernel-cuda-codegen
The crate includes 183 tests covering all kernel types, intrinsics, envelope format, and language features.
Apache-2.0