| Crates.io | lumosai-vector-lancedb |
| lib.rs | lumosai-vector-lancedb |
| version | 0.1.4 |
| created_at | 2025-06-14 13:45:30.93383+00 |
| updated_at | 2025-06-14 13:45:30.93383+00 |
| description | LanceDB integration for LumosAI vector storage - high-performance columnar vector database |
| homepage | https://lumosai.dev |
| repository | https://github.com/louloulin/lumos.ai.git |
| max_upload_size | |
| id | 1712350 |
| size | 310,026 |
High-performance columnar vector database integration for LumosAI, powered by LanceDB.
Add to your `Cargo.toml`:

```toml
[dependencies]
lumosai-vector = { version = "0.1.0", features = ["lancedb"] }
```

Or use the LanceDB crate directly:

```toml
[dependencies]
lumosai-vector-lancedb = "0.1.4"
```
Basic usage:

```rust
use lumosai_vector_lancedb::{LanceDbStorage, LanceDbConfig};
use lumosai_vector_core::{
    traits::VectorStorage,
    types::{Document, IndexConfig, SearchRequest, SimilarityMetric},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Create LanceDB storage
    let config = LanceDbConfig::local("./my_vector_db");
    let storage = LanceDbStorage::new(config).await?;

    // Create an index (384-dimensional, cosine similarity)
    let index_config = IndexConfig::new("documents", 384)
        .with_metric(SimilarityMetric::Cosine);
    storage.create_index(index_config).await?;

    // Insert documents
    let docs = vec![
        Document::new("doc1", "Hello world")
            .with_embedding(vec![0.1; 384])
            .with_metadata("category", "greeting"),
    ];
    storage.upsert_documents("documents", docs).await?;

    // Search
    let search_request = SearchRequest {
        index_name: "documents".to_string(),
        vector: vec![0.1; 384],
        top_k: 5,
        similarity_metric: Some(SimilarityMetric::Cosine),
        filter: None,
        include_metadata: true,
    };
    let results = storage.search(search_request).await?;
    println!("Found {} results", results.results.len());

    Ok(())
}
```
For finer control, build the configuration explicitly:

```rust
use lumosai_vector_lancedb::{LanceDbConfigBuilder, IndexType};
use std::time::Duration;

let config = LanceDbConfigBuilder::new("./advanced_db")
    .timeout(Duration::from_secs(60))
    .batch_size(2000)
    .enable_compression(true)
    .compression_level(8)
    .default_index_type(IndexType::IVFPQ)
    .cache_size(1024 * 1024 * 100) // 100 MB cache
    .build()?;

let storage = LanceDbStorage::new(config).await?;
```
The same storage API works against cloud object stores:

```rust
// AWS S3
let config = LanceDbConfig::s3("my-bucket", "us-west-2");
let storage = LanceDbStorage::new(config).await?;

// Azure Blob Storage
let config = LanceDbConfig::azure("myaccount", "mycontainer");
let storage = LanceDbStorage::new(config).await?;

// Google Cloud Storage
let config = LanceDbConfig::gcs("my-project", "my-bucket");
let storage = LanceDbStorage::new(config).await?;
```
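Note that credentials are not passed in these constructors. A minimal sketch, assuming the cloud backends pick up the standard provider environment variables (as object-store based clients commonly do; the exact mechanism is not confirmed by this crate's docs), or supply them via the `storage_options` field on `LanceDbConfig` (see the configuration reference below):

```rust
// Assumption: the S3 backend reads the standard AWS environment
// variables. Replace the placeholders with real credentials, or rely
// on your ambient credential chain (IAM role, AWS_PROFILE, etc.).
std::env::set_var("AWS_ACCESS_KEY_ID", "YOUR_ACCESS_KEY_ID");
std::env::set_var("AWS_SECRET_ACCESS_KEY", "YOUR_SECRET_ACCESS_KEY");

let config = LanceDbConfig::s3("my-bucket", "us-west-2");
let storage = LanceDbStorage::new(config).await?;
```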
Basic similarity search:

```rust
// `query_vector` is any Vec<f32> with the same dimension as the index.
let search_request = SearchRequest {
    index_name: "documents".to_string(),
    vector: query_vector,
    top_k: 10,
    similarity_metric: Some(SimilarityMetric::Cosine),
    filter: None,
    include_metadata: true,
};
let response = storage.search(search_request).await?;
```
Searches can be narrowed with metadata filters:

```rust
use lumosai_vector_core::types::{FilterCondition, MetadataValue};

// Simple equality filter
let filter = FilterCondition::Eq(
    "category".to_string(),
    MetadataValue::String("technology".to_string()),
);

// Complex filter: category == "tech" AND score > 80
// AND (content contains "machine learning" OR "artificial intelligence")
let complex_filter = FilterCondition::And(vec![
    FilterCondition::Eq("category".to_string(), MetadataValue::String("tech".to_string())),
    FilterCondition::Gt("score".to_string(), MetadataValue::Integer(80)),
    FilterCondition::Or(vec![
        FilterCondition::Contains("content".to_string(), "machine learning".to_string()),
        FilterCondition::Contains("content".to_string(), "artificial intelligence".to_string()),
    ]),
]);

let search_request = SearchRequest {
    index_name: "documents".to_string(),
    vector: query_vector,
    top_k: 10,
    similarity_metric: Some(SimilarityMetric::Cosine),
    filter: Some(complex_filter),
    include_metadata: true,
};
```
Batch insert performance:

| Documents | Batch Size | Time | Throughput |
|---|---|---|---|
| 10K | 1K | 2.3s | 4,347 docs/sec |
| 100K | 2K | 18.7s | 5,347 docs/sec |
| 1M | 5K | 156s | 6,410 docs/sec |
Search latency and throughput by index size:

| Index Size | Query Time | QPS |
|---|---|---|
| 10K docs | 2.1ms | 476 |
| 100K docs | 4.7ms | 213 |
| 1M docs | 12.3ms | 81 |
Memory and on-disk footprint by index type:

| Documents | Index Type | Memory | Storage |
|---|---|---|---|
| 100K | IVF | 1.2GB | 450MB |
| 100K | IVFPQ | 800MB | 280MB |
| 1M | IVF | 12GB | 4.2GB |
| 1M | IVFPQ | 6.8GB | 2.1GB |
IVF (inverted file index) is best for balanced performance and accuracy:

```rust
use lumosai_vector_lancedb::config::{IndexType, IndexParams};

let params = IndexParams {
    num_clusters: Some(256),
    ..Default::default()
};
```
IVFPQ (IVF with product quantization) is best for large datasets with memory constraints:

```rust
let params = IndexParams {
    num_clusters: Some(256),
    num_sub_quantizers: Some(8),
    bits_per_sub_quantizer: Some(8),
    ..Default::default()
};
```
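To see where the savings in the footprint table above come from: with 8 sub-quantizers at 8 bits each, every vector is encoded as 8 one-byte codes, versus 384 dimensions x 4 bytes = 1,536 bytes for a raw f32 vector. The full index also keeps cluster centroids, codebooks, and metadata, so the end-to-end reduction is smaller than the raw code compression suggests.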
HNSW is best for low-latency queries (when available):

```rust
let params = IndexParams {
    hnsw_m: Some(16),
    hnsw_ef_construction: Some(200),
    ..Default::default()
};
```
```rust
// Optimized batch insertion: write large collections in fixed-size chunks.
let documents: Vec<Document> = generate_large_batch(); // your own document source
let batch_size = 2000;

for chunk in documents.chunks(batch_size) {
    storage.upsert_documents("index", chunk.to_vec()).await?;
}
```
```rust
// Range queries (timestamps are 2022-01-01 and 2023-01-01 UTC)
let filter = FilterCondition::And(vec![
    FilterCondition::Gte("timestamp".to_string(), MetadataValue::Integer(1640995200)),
    FilterCondition::Lt("timestamp".to_string(), MetadataValue::Integer(1672531200)),
]);

// Text search
let text_filter = FilterCondition::Or(vec![
    FilterCondition::Contains("title".to_string(), "machine learning".to_string()),
    FilterCondition::StartsWith("category".to_string(), "AI".to_string()),
]);
```
To check query latency in your own environment:

```rust
use std::time::Instant;

let start = Instant::now();
let response = storage.search(search_request).await?;
let duration = start.elapsed();

println!("Search took: {:?}", duration);
println!("Results: {}", response.results.len());
```
Run the included examples:

```bash
# Basic usage
cargo run --example basic_usage

# Batch operations
cargo run --example batch_operations

# Vector search
cargo run --example vector_search

# Metadata filtering
cargo run --example metadata_filtering
```
Run the test suite:

```bash
# Unit tests
cargo test --lib

# Integration tests
cargo test --test integration_tests

# All tests
cargo test
```
The main configuration types:

```rust
pub struct LanceDbConfig {
    pub uri: String,                             // Database URI
    pub timeout: Option<Duration>,               // Connection timeout
    pub max_connections: Option<usize>,          // Connection pool size
    pub enable_wal: bool,                        // Write-ahead logging
    pub storage_options: Option<StorageOptions>, // Cloud storage
    pub index_config: IndexConfiguration,        // Index settings
    pub performance: PerformanceConfig,          // Performance tuning
}

pub struct PerformanceConfig {
    pub batch_size: usize,             // Batch operation size
    pub num_threads: Option<usize>,    // Parallel threads
    pub memory_limit: Option<usize>,   // Memory limit (bytes)
    pub enable_compression: bool,      // Enable compression
    pub compression_level: Option<u8>, // Compression level (0-9)
    pub cache_size: Option<usize>,     // Cache size (bytes)
}
```
Errors are surfaced as `LanceDbError` variants:

```rust
use lumosai_vector_lancedb::error::LanceDbError;

match storage.search(request).await {
    Ok(response) => println!("Found {} results", response.results.len()),
    Err(LanceDbError::NotFound(msg)) => println!("Index not found: {}", msg),
    Err(LanceDbError::Connection(msg)) => println!("Connection error: {}", msg),
    Err(LanceDbError::Query(msg)) => println!("Query error: {}", msg),
    Err(e) => println!("Other error: {}", e),
}
```
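Transient failures can be retried by matching on the same variants. A minimal sketch, assuming `SearchRequest` implements `Clone` (not confirmed by this crate's docs):

```rust
// Retry transient connection errors with a linear backoff.
// Assumption: SearchRequest: Clone.
let mut attempts: u64 = 0;
let response = loop {
    match storage.search(request.clone()).await {
        Ok(r) => break r,
        Err(LanceDbError::Connection(msg)) if attempts < 3 => {
            attempts += 1;
            eprintln!("connection error ({msg}), retry {attempts}/3");
            tokio::time::sleep(std::time::Duration::from_millis(200 * attempts)).await;
        }
        Err(e) => return Err(e.into()),
    }
};
println!("Found {} results", response.results.len());
```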
```rust
// Use with LumosAI vector storage
use lumosai_vector::lancedb::{LanceDbStorage, LanceDbConfig};

let storage = LanceDbStorage::new(LanceDbConfig::local("./data")).await?;

// Use with a RAG pipeline
use lumosai_rag::RagPipeline;

let rag = RagPipeline::builder()
    .vector_storage(storage)
    .embedding_provider(embedding_provider)
    .build();
```
For detailed API documentation, run:

```bash
cargo doc --open
```
This project is licensed under the MIT License - see the LICENSE file for details.