| Crates.io | woolly-tensor |
| lib.rs | woolly-tensor |
| version | 0.1.3 |
| created_at | 2025-06-26 16:51:39.72501+00 |
| updated_at | 2025-06-26 18:47:52.841644+00 |
| description | Tensor operations library for Woolly LLM |
| homepage | https://github.com/Enreign/woolly |
| repository | https://github.com/Enreign/woolly |
| max_upload_size | |
| id | 1727559 |
| size | 349,595 |
High-performance tensor operations library for Woolly, providing efficient mathematical operations with support for multiple backends including CPU (SIMD-accelerated), CUDA, and Metal.
Add to your Cargo.toml:
[dependencies]
woolly-tensor = "0.1"
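Optional backends and integrations are enabled through Cargo features (see the feature list near the end of this README). A minimal sketch, assuming you want the CUDA backend and have a CUDA toolchain available:
[dependencies]
woolly-tensor = { version = "0.1", features = ["cuda"] }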
use woolly_tensor::prelude::*;
fn main() -> Result<(), TensorError> {
// Create a CPU backend
let backend = CpuBackend::new();
// Create tensors
let shape = Shape::matrix(3, 4);
let a = Tensor::zeros(&backend, shape.clone(), DType::F32)?;
let b = Tensor::ones(&backend, shape, DType::F32)?;
// Perform operations
let c = a.add(&b)?;
let d = c.mul_scalar(2.0)?;
// Access data
let data = d.to_vec()?;
println!("Result: {:?}", data);
Ok(())
}
use woolly_tensor::prelude::*;
use woolly_tensor::ops::*;
fn main() -> Result<(), TensorError> {
let backend = CpuBackend::new();
// Create matrices
let a = Tensor::from_data(&backend, vec![1.0, 2.0, 3.0, 4.0], Shape::matrix(2, 2), DType::F32)?;
let b = Tensor::from_data(&backend, vec![5.0, 6.0, 7.0, 8.0], Shape::matrix(2, 2), DType::F32)?;
// Matrix multiplication
let c = MatMul::apply(&a, &b)?;
// Element-wise operations
let d = Add::apply(&a, &b)?;
let e = ReLU::apply(&d)?;
println!("Matrix multiplication result: {:?}", c.to_vec()?);
println!("Element-wise addition + ReLU: {:?}", e.to_vec()?);
Ok(())
}
use woolly_tensor::prelude::*;
use woolly_tensor::ops::*;
fn main() -> Result<(), TensorError> {
let backend = CpuBackend::new();
// Large arrays benefit from SIMD acceleration
let size = 1024 * 1024;
let data_a: Vec<f32> = (0..size).map(|i| i as f32).collect();
let data_b: Vec<f32> = (0..size).map(|i| (i * 2) as f32).collect();
let shape = Shape::vector(size);
let a = Tensor::from_data(&backend, data_a, shape.clone(), DType::F32)?;
let b = Tensor::from_data(&backend, data_b, shape, DType::F32)?;
// SIMD-accelerated operations
let start = std::time::Instant::now();
let _result = Add::apply(&a, &b)?; // result is unused here; we only measure elapsed time
let duration = start.elapsed();
println!("SIMD addition of {} elements took: {:?}", size, duration);
Ok(())
}
use woolly_tensor::prelude::*;
let backend = CpuBackend::new();
// From Vec
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let tensor = Tensor::from_data(&backend, data, Shape::matrix(2, 3), DType::F32)?;
// From slice
let data = &[1.0f32, 2.0, 3.0, 4.0];
let tensor = Tensor::from_slice(&backend, data, Shape::vector(4), DType::F32)?;
use woolly_tensor::prelude::*;
let backend = CpuBackend::new();
let shape = Shape::from_slice(&[2, 3, 4]);
// Common initializations
let zeros = Tensor::zeros(&backend, shape.clone(), DType::F32)?;
let ones = Tensor::ones(&backend, shape.clone(), DType::F32)?;
let random = Tensor::randn(&backend, shape.clone(), DType::F32)?;
// Custom initialization
let tensor = Tensor::full(&backend, shape, 3.14, DType::F32)?;
// Range tensor
let range = Tensor::arange(&backend, 0.0, 10.0, 1.0, DType::F32)?;
use woolly_tensor::prelude::*;
let backend = CpuBackend::new();
let tensor = Tensor::arange(&backend, 0.0, 24.0, 1.0, DType::F32)?;
// Reshape
let reshaped = tensor.reshape(&Shape::from_slice(&[2, 3, 4]))?;
// Transpose
let transposed = reshaped.transpose(&[2, 0, 1])?;
// Squeeze and unsqueeze
let squeezed = transposed.squeeze()?;
let unsqueezed = squeezed.unsqueeze(1)?;
// Slice
let sliced = tensor.slice(&[0..12])?;
use woolly_tensor::prelude::*;
use woolly_tensor::ops::*;
let backend = CpuBackend::new();
let tensor = Tensor::from_data(&backend, vec![-2.0, -1.0, 0.0, 1.0, 2.0], Shape::vector(5), DType::F32)?;
// Activation functions
let relu = ReLU::apply(&tensor)?;
let sigmoid = Sigmoid::apply(&tensor)?;
let tanh = Tanh::apply(&tensor)?;
let gelu = GELU::apply(&tensor)?;
// Mathematical functions
let abs = Abs::apply(&tensor)?;
let exp = Exp::apply(&tensor)?;
let log = Log::apply(&tensor)?;
let sqrt = Sqrt::apply(&tensor)?;
// Trigonometric functions
let sin = Sin::apply(&tensor)?;
let cos = Cos::apply(&tensor)?;
use woolly_tensor::prelude::*;
use woolly_tensor::ops::*;
let backend = CpuBackend::new();
let a = Tensor::ones(&backend, Shape::matrix(3, 3), DType::F32)?;
let b = Tensor::full(&backend, Shape::matrix(3, 3), 2.0, DType::F32)?;
// Arithmetic operations
let add = Add::apply(&a, &b)?;
let sub = Sub::apply(&a, &b)?;
let mul = Mul::apply(&a, &b)?;
let div = Div::apply(&a, &b)?;
// Comparison operations
let eq = Equal::apply(&a, &b)?;
let gt = Greater::apply(&a, &b)?;
let lt = Less::apply(&a, &b)?;
// Logical operations
let and = And::apply(&a, &b)?;
let or = Or::apply(&a, &b)?;
use woolly_tensor::prelude::*;
use woolly_tensor::ops::*;
let backend = CpuBackend::new();
let tensor = Tensor::from_data(&backend, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], Shape::matrix(2, 3), DType::F32)?;
// Sum operations
let sum_all = Sum::apply(&tensor, None)?; // Sum all elements
let sum_dim0 = Sum::apply(&tensor, Some(0))?; // Sum along dimension 0
let sum_dim1 = Sum::apply(&tensor, Some(1))?; // Sum along dimension 1
// Other reductions
let mean = Mean::apply(&tensor, Some(1))?;
let max = Max::apply(&tensor, Some(0))?;
let min = Min::apply(&tensor, Some(0))?;
let argmax = ArgMax::apply(&tensor, 1)?;
use woolly_tensor::prelude::*;
use woolly_tensor::ops::*;
let backend = CpuBackend::new();
// Matrix multiplication
let a = Tensor::randn(&backend, Shape::matrix(64, 128), DType::F32)?;
let b = Tensor::randn(&backend, Shape::matrix(128, 256), DType::F32)?;
let c = MatMul::apply(&a, &b)?;
// Batch matrix multiplication
let a_batch = Tensor::randn(&backend, Shape::from_slice(&[32, 64, 128]), DType::F32)?;
let b_batch = Tensor::randn(&backend, Shape::from_slice(&[32, 128, 256]), DType::F32)?;
let c_batch = BatchMatMul::apply(&a_batch, &b_batch)?;
// Vector operations
let v1 = Tensor::randn(&backend, Shape::vector(1024), DType::F32)?;
let v2 = Tensor::randn(&backend, Shape::vector(1024), DType::F32)?;
let dot_product = Dot::apply(&v1, &v2)?;
use woolly_tensor::prelude::*;
let backend = CpuBackend::new();
let tensor = Tensor::randn(&backend, Shape::matrix(1024, 1024), DType::F32)?;
// Different quantization methods
let q4_0_quantizer = Q4_0Quantizer::new();
let q4_1_quantizer = Q4_1Quantizer::new();
let q8_0_quantizer = Q8_0Quantizer::new();
let int8_quantizer = Int8Quantizer::new();
// Quantize tensor
let quantized_q4_0 = q4_0_quantizer.quantize(&tensor)?;
let quantized_q8_0 = q8_0_quantizer.quantize(&tensor)?;
// Dequantize back to float
let dequantized = q4_0_quantizer.dequantize(&quantized_q4_0)?;
println!("Original size: {} bytes", tensor.size_in_bytes());
println!("Q4_0 size: {} bytes", quantized_q4_0.size_in_bytes());
println!("Compression ratio: {:.2}x",
tensor.size_in_bytes() as f32 / quantized_q4_0.size_in_bytes() as f32);
use woolly_tensor::prelude::*;
struct CustomQuantizer {
scale: f32,
zero_point: i32,
}
impl Quantizer for CustomQuantizer {
type QuantizedType = i8;
fn quantize(&self, tensor: &Tensor) -> Result<Tensor, TensorError> {
// Implement custom quantization logic
todo!()
}
fn dequantize(&self, quantized: &Tensor) -> Result<Tensor, TensorError> {
// Implement custom dequantization logic
todo!()
}
}
use woolly_tensor::prelude::*;
// Create CPU backend with automatic SIMD detection
let backend = CpuBackend::new();
// Check available SIMD features
println!("AVX2 support: {}", backend.has_avx2());
println!("AVX-512 support: {}", backend.has_avx512());
println!("NEON support: {}", backend.has_neon());
// Configure thread pool
let backend = CpuBackend::with_threads(8);
#[cfg(feature = "cuda")]
use woolly_tensor::prelude::*;
#[cfg(feature = "cuda")]
{
// Create CUDA backend
let backend = CudaBackend::new(0)?; // GPU device 0
// Check device properties
println!("Device name: {}", backend.device_name());
println!("Memory: {} MB", backend.total_memory() / 1024 / 1024);
println!("Compute capability: {:?}", backend.compute_capability());
// Create tensors on GPU
let tensor = Tensor::randn(&backend, Shape::matrix(1024, 1024), DType::F32)?;
// Operations run on GPU
let result = tensor.relu()?;
// Copy to CPU if needed
let cpu_backend = CpuBackend::new();
let cpu_tensor = result.to_backend(&cpu_backend)?;
}
#[cfg(feature = "metal")]
use woolly_tensor::prelude::*;
#[cfg(feature = "metal")]
{
// Create Metal backend
let backend = MetalBackend::new()?;
// Check device properties
println!("Device name: {}", backend.device_name());
println!("Unified memory: {}", backend.has_unified_memory());
// Operations use Metal Performance Shaders
let tensor = Tensor::randn(&backend, Shape::matrix(2048, 2048), DType::F32)?;
let result = tensor.matmul(&tensor.transpose(&[1, 0])?)?;
}
use woolly_tensor::prelude::*;
let backend = CpuBackend::new();
// Row-major (C-style) layout - default
let row_major = Tensor::zeros(&backend, Shape::matrix(1000, 1000), DType::F32)?;
// Column-major (Fortran-style) layout
let col_major = row_major.transpose(&[1, 0])?;
// Contiguous memory access patterns are faster
let contiguous = col_major.contiguous()?;
use woolly_tensor::prelude::*;
use woolly_tensor::ops::simd::*;
// Direct SIMD operations for maximum performance
let a = vec![1.0f32; 1024];
let b = vec![2.0f32; 1024];
let mut result = vec![0.0f32; 1024];
// Use SIMD directly
#[cfg(target_arch = "x86_64")]
{
if std::arch::is_x86_feature_detected!("avx2") {
unsafe {
avx2_add_f32(&a, &b, &mut result);
}
}
}
#[cfg(target_arch = "aarch64")]
{
unsafe {
neon_add_f32(&a, &b, &mut result);
}
}
use woolly_tensor::prelude::*;
let backend = CpuBackend::with_threads(8);
// Process data in batches for better cache utilization.
// `data`, `feature_size`, and `model` are assumed to be defined by the surrounding application.
let batch_size = 64;
let total_samples = 10000;
for batch_start in (0..total_samples).step_by(batch_size) {
let batch_end = (batch_start + batch_size).min(total_samples);
// Each sample holds `feature_size` values, so slice whole rows to match the batch shape below
let batch_data = &data[batch_start * feature_size..batch_end * feature_size];
let batch_tensor = Tensor::from_slice(&backend, batch_data,
Shape::matrix(batch_end - batch_start, feature_size),
DType::F32)?;
// Process batch
let result = model.forward(&batch_tensor)?;
}
use woolly_tensor::prelude::*;
use woolly_tensor::ops::*;
struct CustomSwish;
impl UnaryOp for CustomSwish {
fn apply_f32(input: &[f32], output: &mut [f32]) -> Result<(), TensorError> {
for (i, &x) in input.iter().enumerate() {
output[i] = x / (1.0 + (-x).exp());
}
Ok(())
}
fn apply_f16(input: &[f16], output: &mut [f16]) -> Result<(), TensorError> {
// F16 implementation
todo!()
}
}
// Use the custom operation
let backend = CpuBackend::new();
let tensor = Tensor::randn(&backend, Shape::vector(1000), DType::F32)?;
let result = CustomSwish::apply(&tensor)?;
use woolly_tensor::prelude::*;
let backend = CpuBackend::new();
// Pre-allocate storage for better performance
let storage = TensorStorage::allocate(&backend, 1024 * 1024, DType::F32)?;
let tensor = Tensor::from_storage(storage, Shape::matrix(1024, 1024))?;
// Memory-mapped tensors for large datasets
let mmap_tensor = Tensor::from_file(&backend, "large_weights.bin",
Shape::from_slice(&[10000, 4096]),
DType::F32)?;
// Shared memory between tensors (zero-copy views)
let view = tensor.view(&[0..512, 0..512])?;
let slice = tensor.slice(&[100..200, 50..150])?;
use woolly_tensor::prelude::*;
// `tensor_operation()` stands in for any fallible tensor computation
match tensor_operation() {
Ok(result) => println!("Success: {:?}", result.shape()),
Err(TensorError::ShapeMismatch { expected, actual }) => {
eprintln!("Shape mismatch: expected {:?}, got {:?}", expected, actual);
}
Err(TensorError::InvalidDType { expected, actual }) => {
eprintln!("Type mismatch: expected {:?}, got {:?}", expected, actual);
}
Err(TensorError::OutOfMemory { requested, available }) => {
eprintln!("Out of memory: requested {} bytes, {} available", requested, available);
}
Err(TensorError::BackendError(msg)) => {
eprintln!("Backend error: {}", msg);
}
Err(e) => {
eprintln!("Other error: {}", e);
}
}
# Run all benchmarks
cargo bench
# Run specific benchmark category
cargo bench tensor_ops
cargo bench matmul
cargo bench quantization
# Compare backends
cargo bench --features="cuda,metal"
Example benchmark results:
Matrix Multiplication (1024x1024):
CPU (AVX2): 45.2 ms
CUDA (RTX 4090): 2.3 ms
Metal (M2 Max): 3.1 ms
Element-wise Add (1M elements):
CPU (SIMD): 0.8 ms
CPU (scalar): 3.2 ms
CUDA: 0.1 ms
cuda: Enable NVIDIA CUDA support
metal: Enable Apple Metal support
mkl: Intel Math Kernel Library integration
blas: Generic BLAS integration
benchmarks: Include benchmarking utilities
We welcome contributions! Please see the Contributing Guide for details.
Licensed under either of:
at your option.