| Crates.io | unsafe_math |
| lib.rs | unsafe_math |
| version | 0.1.1 |
| created_at | 2025-06-24 10:57:06.063868+00 |
| updated_at | 2025-06-24 19:25:34.021894+00 |
| description | proc_macro to replace math operation with their unchecked/f_fast versions |
| homepage | |
| repository | https://github.com/platonvin/unsafe_math |
| max_upload_size | |
| id | 1724162 |
| size | 26,028 |
unsafe_math is a proc_macro that replaces math with unchecked / fast-math versions.
In practice, this makes the generated assembly for math code match what GCC/Clang produce with "-Ofast".
Requires nightly
Simply add the #[unsafe_math] attribute to the item or scope you want it to apply to:
use unsafe_math::unsafe_math;
#[unsafe_math]
fn function(...) -> ... {
...
}
#[unsafe_math]
impl Trait for Type {
...
}
// you need these to invoke proc_macro on {...} statements
#![feature(stmt_expr_attributes)]
#![feature(proc_macro_hygiene)]
#[unsafe_math]
{
...
}
You can also do this:
use unsafe_math::unsafe_math_block;
unsafe_math_block! {
...
}
This section demonstrates the effect of #[unsafe_math] on the generated assembly for a few examples.
somefun_slow is the unmodified (slow) version; somefun_fast is the same function with #[unsafe_math] applied:
/// Multiplies `block` by 16 and then divides the product by 8.
///
/// Mathematically this is `block * 2`, but the intermediate `block * 16`
/// can overflow an `i32`, so the two forms are not interchangeable for
/// every input.
fn convert(block: i32) -> i32 {
(block * 16) / 8
}
The difference in assembly is as follows:
convert_slow:
mov eax, ecx
shl eax, 4
sar eax, 3
ret
convert_fast:
lea eax, [rcx + rcx]
ret
/// Adds up `1 << i` for every `i` in `0..a`, i.e. 2^0 + 2^1 + ... + 2^(a-1).
///
/// An empty range (`a == 0`) yields 0.
fn sum(a: u32) -> u32 {
// `as` binds tighter than `<<`, so this is `1 << (i as u32)`; `i` already comes from a `u32` range.
(0..a).map(|i| 1 << i as u32).sum()
}
sum_slow:
push rsi
test ecx, ecx
je .LBB7_1
mov r8d, ecx
mov r9d, ecx
and r9d, 3
cmp ecx, 4
jae .LBB7_4
xor eax, eax
xor edx, edx
jmp .LBB7_6
.LBB7_1:
xor eax, eax
pop rsi
ret
.LBB7_4:
and r8d, -4
xor eax, eax
xor edx, edx
.LBB7_5:
mov ecx, edx
and cl, 28
mov r10d, 1
shl r10d, cl
mov r11d, 2
shl r11d, cl
mov esi, 4
shl esi, cl
add r10d, eax
add esi, r11d
mov eax, 8
shl eax, cl
add esi, r10d
add edx, 4
add eax, esi
cmp r8d, edx
jne .LBB7_5
.LBB7_6:
test r9d, r9d
je .LBB7_8
.LBB7_7:
mov r8d, 1
mov ecx, edx
shl r8d, cl
inc edx
add eax, r8d
dec r9d
jne .LBB7_7
.LBB7_8:
pop rsi
ret
sum_fast:
test ecx, ecx
je .LBB10_1
mov edx, ecx
mov r9d, ecx
and r9d, 7
cmp ecx, 8
jae .LBB10_4
xor eax, eax
xor ecx, ecx
jmp .LBB10_6
.LBB10_1:
xor eax, eax
ret
.LBB10_4:
and edx, -8
xor eax, eax
xor r8d, r8d
.LBB10_5:
mov r10d, 255
mov ecx, r8d
shl r10d, cl
lea ecx, [r8 + 8]
add eax, r10d
mov r8d, ecx
cmp edx, ecx
jne .LBB10_5
.LBB10_6:
test r9d, r9d
je .LBB10_8
.LBB10_7:
mov edx, 1
shl edx, cl
inc ecx
add eax, edx
dec r9d
jne .LBB10_7
.LBB10_8:
ret
Using your brain is still better though:
/// Closed form of the loop in `sum`: 2^0 + 2^1 + ... + 2^(a-1) == 2^a - 1,
/// spelled here as `(2 << (a - 1)) - 1`.
///
/// NOTE(review): for `a == 0` the `a - 1` underflows, so this does not
/// reproduce `sum(0) == 0` — presumably callers only pass `a >= 1`; confirm.
#[unsafe(no_mangle)]
fn sum_smart(a: u32) -> u32 {
(2 << (a - 1)) - 1
}
(5 instructions, ~11ns / characteristic iter)
sum_smart:
dec cl
mov eax, 2
shl eax, cl
dec eax
ret
/// Blends four values with bilinear weights derived from `fx` and `fy`.
///
/// The weights are:
/// - `a00`: `(1 - fx) * (1 - fy)`
/// - `a10`: `fx * (1 - fy)`
/// - `a01`: `(1 - fx) * fy`
/// - `a11`: `fx * fy`
///
/// Returns the weighted sum of the four inputs.
/// NOTE(review): `fx` / `fy` are presumably fractions in `[0, 1]`; nothing here enforces that.
pub fn bilinear_sample(a00: f64, a10: f64, a01: f64, a11: f64, fx: f64, fy: f64) -> f64 {
// Complements of the two fractions.
let inv_fx = 1.0 - fx;
let inv_fy = 1.0 - fy;
// One weight per input; the first index digit pairs with fx, the second with fy.
let w00 = inv_fx * inv_fy;
let w10 = fx * inv_fy;
let w01 = inv_fx * fy;
let w11 = fx * fy;
// Accumulate the weighted terms one at a time, starting from 0.0, in a fixed order.
let mut result = 0.0f64;
result += a00 * w00;
result += a10 * w10;
result += a01 * w01;
result += a11 * w11;
result
}
bilinear_sample_slow:
sub rsp, 40
movaps xmmword ptr [rsp + 16], xmm7
movaps xmmword ptr [rsp], xmm6
movsd xmm4, qword ptr [rsp + 88]
movsd xmm5, qword ptr [rsp + 80]
movapd xmm6, xmm4
unpcklpd xmm6, xmm5
movapd xmm7, xmmword ptr [rip + __xmm@3ff00000000000003ff0000000000000]
subpd xmm7, xmm6
movapd xmm6, xmm7
unpckhpd xmm6, xmm7
mulsd xmm6, xmm7
mulsd xmm5, xmm4
mulsd xmm0, xmm6
xorpd xmm4, xmm4
addsd xmm0, xmm4
mulpd xmm7, xmmword ptr [rsp + 80]
unpcklpd xmm1, xmm2
mulpd xmm7, xmm1
addsd xmm0, xmm7
unpckhpd xmm7, xmm7
addsd xmm0, xmm7
mulsd xmm5, xmm3
addsd xmm0, xmm5
movaps xmm6, xmmword ptr [rsp]
movaps xmm7, xmmword ptr [rsp + 16]
add rsp, 40
ret
bilinear_sample_fast:
movsd xmm4, qword ptr [rsp + 40]
movsd xmm5, qword ptr [rip + __real@3ff0000000000000]
subsd xmm5, xmm4
movddup xmm4, xmm4
unpcklpd xmm3, xmm1
mulpd xmm4, xmm3
movddup xmm1, xmm5
unpcklpd xmm2, xmm0
mulpd xmm1, xmm2
addpd xmm1, xmm4
movapd xmm0, xmm1
unpckhpd xmm0, xmm1
subsd xmm1, xmm0
mulsd xmm1, qword ptr [rsp + 48]
addsd xmm1, xmm0
movapd xmm0, xmm1
ret
I am not actually sure how to properly test what happens on overflow, since it is literally UB now. Let me know if you have any ideas.
cargo test / cargo bench will run all tests / benches, as usual.
This project is licensed under the MIT license. See LICENSE for details.