| Crates.io | manifold-vectors |
| lib.rs | manifold-vectors |
| version | 0.1.0 |
| created_at | 2025-11-03 03:29:02.231278+00 |
| updated_at | 2025-11-03 03:29:02.231278+00 |
| description | Vector storage optimizations for Manifold embedded database |
| homepage | https://github.com/tomWhiting/manifold |
| repository | https://github.com/tomWhiting/manifold |
| max_upload_size | |
| id | 1913840 |
| size | 227,972 |
Vector storage optimizations for the Manifold embedded database.
manifold-vectors provides ergonomic, type-safe wrappers around Manifold's core primitives for storing and retrieving vector embeddings commonly used in ML/AI applications. It does not implement vector indexing algorithms - instead, it focuses on efficient persistent storage and provides integration traits for external libraries like instant-distance.
- `fixed_width()` trait for direct memory-mapped access without deserialization overhead
- `VectorSource` trait for external index libraries (HNSW, FAISS, etc.)
use manifold::column_family::ColumnFamilyDatabase;
use manifold_vectors::{VectorTable, VectorTableRead, distance};
// Open database and column family
let db = ColumnFamilyDatabase::open("my.db")?;
let cf = db.column_family_or_create("embeddings")?;
// Write vectors
{
let write_txn = cf.begin_write()?;
let mut vectors = VectorTable::<768>::open(&write_txn, "docs")?;
let embedding = [0.1f32; 768];
vectors.insert("doc_1", &embedding)?;
drop(vectors);
write_txn.commit()?;
}
// Read with zero-copy access - no allocations!
let read_txn = cf.begin_read()?;
let vectors = VectorTableRead::<768>::open(&read_txn, "docs")?;
if let Some(guard) = vectors.get("doc_1")? {
// guard provides zero-copy access to mmap'd data
let query = [0.1f32; 768];
let similarity = distance::cosine(guard.value(), &query);
println!("Cosine similarity: {}", similarity);
// guard dropped here - no malloc/free occurred!
}
For high-throughput vector loading, use batch operations which leverage Manifold's WAL group commit:
let items = vec![
("doc_1", [0.1f32; 768]),
("doc_2", [0.2f32; 768]),
("doc_3", [0.3f32; 768]),
];
let write_txn = cf.begin_write()?;
let mut vectors = VectorTable::<768>::open(&write_txn, "docs")?;
// Insert all vectors in one batch
vectors.insert_batch(&items, false)?;
drop(vectors);
write_txn.commit()?;
For high-dimensional sparse vectors (e.g., TF-IDF, one-hot encodings):
use manifold_vectors::sparse::{SparseVector, SparseVectorTable, SparseVectorTableRead};
// Create sparse vector (COO format: coordinate list)
let sparse = SparseVector::new(vec![
(0, 0.5), // index 0, value 0.5
(100, 0.8), // index 100, value 0.8
(1000, 0.3), // index 1000, value 0.3
]);
// Write
{
let write_txn = cf.begin_write()?;
let mut sparse_table = SparseVectorTable::open(&write_txn, "sparse")?;
sparse_table.insert("doc_1", &sparse)?;
drop(sparse_table);
write_txn.commit()?;
}
// Read
let read_txn = cf.begin_read()?;
let sparse_table = SparseVectorTableRead::open(&read_txn, "sparse")?;
let retrieved = sparse_table.get("doc_1")?.unwrap();
// Compute sparse dot product
let other = SparseVector::new(vec![(0, 1.0), (100, 0.5)]);
let dot = retrieved.dot(&other);
println!("Sparse dot product: {}", dot);
For storing multiple vectors per document (e.g., token embeddings):
use manifold_vectors::multi::{MultiVectorTable, MultiVectorTableRead};
// Each document has multiple token embeddings
let token_embeddings = vec![
[0.1f32; 128], // token 1
[0.2f32; 128], // token 2
[0.3f32; 128], // token 3
];
{
let write_txn = cf.begin_write()?;
let mut multi = MultiVectorTable::<128>::open(&write_txn, "tokens")?;
multi.insert("doc_1", &token_embeddings)?;
drop(multi);
write_txn.commit()?;
}
let read_txn = cf.begin_read()?;
let multi = MultiVectorTableRead::<128>::open(&read_txn, "tokens")?;
let tokens = multi.get("doc_1")?.unwrap();
println!("Document has {} token embeddings", tokens.len());
The crate includes common distance and similarity metrics that work directly with zero-copy VectorGuard types:
use manifold_vectors::distance;
let vec_a = [1.0, 0.0, 0.0];
let vec_b = [0.0, 1.0, 0.0];
let cosine_sim = distance::cosine(&vec_a, &vec_b); // 0.0 (orthogonal)
let euclidean = distance::euclidean(&vec_a, &vec_b); // sqrt(2)
let euclidean_sq = distance::euclidean_squared(&vec_a, &vec_b); // 2.0 (faster)
let manhattan = distance::manhattan(&vec_a, &vec_b); // 2.0
let dot = distance::dot_product(&vec_a, &vec_b); // 0.0
Dense vectors use Manifold's Value trait with fixed-width serialization:
impl Value for [f32; DIM] {
type SelfType<'a> = [f32; DIM];
type AsBytes<'a> = &'a [u8];
fn fixed_width() -> Option<usize> {
Some(DIM * 4) // 4 bytes per f32
}
// ...
}
This enables true zero-copy reads - vectors are read directly from memory-mapped pages without deserialization.
The VectorSource trait enables integration with external vector index libraries:
use manifold_vectors::VectorSource;
use instant_distance::{Builder, Search};
let read_txn = cf.begin_read()?;
let vectors = VectorTableRead::<768>::open(&read_txn, "docs")?;
// Build HNSW index from stored vectors
let mut points = Vec::new();
let mut ids = Vec::new();
for result in vectors.all_vectors()? {
let (id, guard) = result?;
points.push(instant_distance::Point::new(guard.value().to_vec()));
ids.push(id);
}
let hnsw = Builder::default().build(&points, &mut rand::rng());
// Search for nearest neighbors
let query = instant_distance::Point::new(vec![0.5f32; 768]);
let search = Search::default();
let results = hnsw.search(&query, &search);
for item in results {
println!("Similar doc: {} (distance: {})", ids[item.pid], item.distance);
}
The crate includes comprehensive examples demonstrating real-world usage:
(examples/dense_semantic_search.rs) Full RAG pipeline with:
cargo run --example dense_semantic_search -p manifold-vectors
(examples/sparse_hybrid_search.rs) Combines dense and sparse vectors for hybrid search:
cargo run --example sparse_hybrid_search -p manifold-vectors
(examples/multi_vector_colbert.rs) Token-level embeddings for fine-grained matching:
cargo run --example multi_vector_colbert -p manifold-vectors
(examples/rag_complete.rs) Production RAG implementation:
cargo run --example rag_complete -p manifold-vectors
manifold-vectors works seamlessly with other manifold domain layers in the same database:
use manifold::column_family::ColumnFamilyDatabase;
use manifold_vectors::VectorTable;
use manifold_graph::GraphTable;
use manifold_timeseries::TimeSeriesTable;
let db = ColumnFamilyDatabase::open("my_app.db")?;
// Different column families for different access patterns
let vectors_cf = db.column_family_or_create("embeddings")?;
let graph_cf = db.column_family_or_create("relationships")?;
let metrics_cf = db.column_family_or_create("usage")?;
// Store user embeddings
let txn = vectors_cf.begin_write()?;
let mut vectors = VectorTable::<512>::open(&txn, "users")?;
vectors.insert("user_1", &embedding)?;
// Store user relationships
let txn = graph_cf.begin_write()?;
let mut graph = GraphTable::open(&txn, "follows")?;
graph.add_edge(&user_1, "follows", &user_2, true, 1.0)?;
// Store user activity metrics
let txn = metrics_cf.begin_write()?;
let mut ts = TimeSeriesTable::open(&txn, "activity")?;
ts.write("user_1.logins", timestamp, 1.0)?;
- Requires manifold version 3.1+
- Pass `sorted: true` when batch keys are already sorted — saves sorting overhead
- Avoid `.value().to_vec()` unless an owned copy is actually needed
Licensed under either of:
at your option.
Contributions are welcome! This crate follows the patterns established in the manifold domain layer architecture.