| Field | Value |
|---|---|
| Crates.io | voirs-cloning |
| lib.rs | voirs-cloning |
| version | 0.1.0-alpha.1 |
| created_at | 2025-09-21 04:15:16.059389+00 |
| updated_at | 2025-09-21 04:15:16.059389+00 |
| description | Voice cloning and speaker adaptation system for VoiRS |
| homepage | https://github.com/cool-japan/voirs |
| repository | https://github.com/cool-japan/voirs |
# Advanced Voice Cloning and Speaker Adaptation System

This crate provides comprehensive voice cloning capabilities, including few-shot speaker adaptation, speaker verification, voice similarity measurement, and cross-language cloning.
## Quick Start

```rust
use voirs_cloning::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // Load audio samples for cloning
    let voice_samples = vec![
        VoiceSample::from_file("speaker1_sample1.wav").await?,
        VoiceSample::from_file("speaker1_sample2.wav").await?,
        VoiceSample::from_file("speaker1_sample3.wav").await?,
    ];

    // Create the voice cloner
    let cloner = VoiceCloner::builder()
        .with_method(CloningMethod::FewShot)
        .with_quality_threshold(0.8)
        .build()
        .await?;

    // Clone the voice
    let request = VoiceCloneRequest::new(voice_samples)
        .with_target_text("Hello, this is a test of voice cloning.")
        .with_adaptation_steps(100);
    let result = cloner.clone_voice(request).await?;

    // Save the cloned voice model
    result.save_model("cloned_voice.bin").await?;

    // Evaluate cloning quality
    let quality = result.quality_metrics();
    println!("Cloning quality: {:.2}", quality.overall_score);

    Ok(())
}
```
## Speaker Embedding Extraction

```rust
use voirs_cloning::prelude::*;

// Create the embedding extractor
let extractor = SpeakerEmbeddingExtractor::new().await?;

// Extract an embedding from audio
let audio_data = load_audio("speaker.wav").await?;
let embedding = extractor.extract_embedding(&audio_data).await?;

// Compare with another speaker
let other_embedding = extractor.extract_embedding(&other_audio).await?;
let similarity = embedding.cosine_similarity(&other_embedding);
println!("Speaker similarity: {:.3}", similarity);
```
## Real-time Voice Adaptation

```rust
use voirs_cloning::prelude::*;
use tokio::time::{interval, Duration};

#[tokio::main]
async fn main() -> Result<()> {
    let cloner = VoiceCloner::builder()
        .with_real_time_adaptation()
        .build()
        .await?;

    let mut adaptation_buffer = Vec::new();
    let mut timer = interval(Duration::from_secs(5));

    loop {
        timer.tick().await;

        // Collect new audio samples
        if let Ok(new_sample) = collect_audio_sample().await {
            adaptation_buffer.push(new_sample);

            // Adapt the voice model once enough new data has accumulated
            if adaptation_buffer.len() >= 3 {
                let adapted_model = cloner.adapt_voice(&adaptation_buffer).await?;
                update_synthesis_model(adapted_model).await?;
                adaptation_buffer.clear();
            }
        }
    }
}
```
## Voice Characteristics Analysis

```rust
use voirs_cloning::prelude::*;

// Analyze voice characteristics
let analyzer = VoiceAnalyzer::new().await?;
let audio = load_audio("speaker.wav").await?;
let characteristics = analyzer.analyze_voice(&audio).await?;

println!("Fundamental frequency: {:.1} Hz", characteristics.f0_mean);
println!("Pitch range: {:.1} semitones", characteristics.pitch_range);
println!("Speaking rate: {:.1} words/min", characteristics.speaking_rate);
println!("Voice quality: {:?}", characteristics.voice_quality);
```
## Voice Similarity Measurement

```rust
use voirs_cloning::similarity::*;

// Create the similarity measurer
let measurer = SimilarityMeasurer::builder()
    .with_perceptual_weighting(0.6)
    .with_embedding_weighting(0.4)
    .build()?;

// Measure similarity between the original and cloned voices
let similarity_score = measurer
    .measure_similarity(&original_voice, &cloned_voice)
    .await?;

println!("Perceptual similarity: {:.3}", similarity_score.perceptual);
println!("Embedding similarity: {:.3}", similarity_score.embedding);
println!("Overall similarity: {:.3}", similarity_score.overall);
```
## Cloning Method Configuration

```rust
use voirs_cloning::types::*;

// Configure different cloning approaches
let few_shot_config = CloningConfig::builder()
    .method(CloningMethod::FewShot {
        min_samples: 3,
        max_samples: 10,
        adaptation_steps: 200,
    })
    .quality_threshold(0.85)
    .build()?;

let zero_shot_config = CloningConfig::builder()
    .method(CloningMethod::ZeroShot {
        embedding_similarity_threshold: 0.9,
    })
    .enable_cross_lingual(true)
    .build()?;
```
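A common pattern is to choose the method from how much reference audio is available; a sketch assuming the two configs above and the `voice_samples` vector from the quick-start example are in scope:

```rust
// Hypothetical selection: prefer few-shot adaptation when enough reference
// samples are available, otherwise fall back to zero-shot cloning.
let config = if voice_samples.len() >= 3 {
    few_shot_config
} else {
    zero_shot_config
};
```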
## Speaker Profiles

```rust
use voirs_cloning::types::*;

// Create a comprehensive speaker profile
let profile = SpeakerProfile::builder()
    .with_name("Speaker 1")
    .with_samples(voice_samples)
    .with_metadata(SpeakerMetadata {
        age_range: Some(25..35),
        gender: Some(Gender::Female),
        accent: Some("American English".to_string()),
        languages: vec!["en-US".to_string()],
    })
    .with_embedding(speaker_embedding)
    .build()?;
```
## Cross-lingual Voice Cloning

```rust
use voirs_cloning::prelude::*;

// Clone a voice across languages
let cloner = VoiceCloner::builder()
    .with_cross_lingual_support()
    .with_phoneme_mapping("en-US", "ja-JP")
    .build()
    .await?;

// English source samples, Japanese target text
let english_samples = load_english_samples().await?;
// "Hello, this is a voice cloning test."
let japanese_text = "こんにちは、これは音声クローンのテストです。";

let result = cloner
    .clone_cross_lingual(english_samples, japanese_text, "ja-JP")
    .await?;
```
## Voice Morphing

```rust
use voirs_cloning::morphing::*;

// Morph between multiple speakers
let morpher = VoiceMorpher::new();
let speaker_a = load_speaker_profile("speaker_a.json").await?;
let speaker_b = load_speaker_profile("speaker_b.json").await?;

// Create a morphed voice (30% speaker A, 70% speaker B)
let morphed_profile = morpher
    .morph_voices(&[(speaker_a, 0.3), (speaker_b, 0.7)])
    .await?;
```
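Conceptually, morphing of this kind is often a weighted interpolation in speaker-embedding space. A minimal sketch under the assumption that embeddings are plain `Vec<f32>` (not the crate's actual representation):

```rust
/// Weighted average of speaker embeddings. Weights are normalized
/// internally, so they need not sum to 1.0.
fn morph_embeddings(embeddings: &[(Vec<f32>, f32)]) -> Vec<f32> {
    assert!(!embeddings.is_empty(), "need at least one embedding");
    let dim = embeddings[0].0.len();
    let total: f32 = embeddings.iter().map(|(_, w)| *w).sum();
    let mut out = vec![0.0_f32; dim];
    for (embedding, weight) in embeddings {
        for (o, v) in out.iter_mut().zip(embedding) {
            *o += v * (weight / total);
        }
    }
    out
}
```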
## Quality Assessment

```rust
use voirs_cloning::quality::*;

// Assess cloning quality
let assessor = CloningQualityAssessor::new().await?;
let metrics = assessor
    .assess_quality(&original_audio, &cloned_audio, &reference_text)
    .await?;

println!("Naturalness: {:.2}", metrics.naturalness);
println!("Speaker similarity: {:.2}", metrics.speaker_similarity);
println!("Intelligibility: {:.2}", metrics.intelligibility);
println!("Audio quality: {:.2}", metrics.audio_quality);
```
## Consent Management

```rust
use voirs_cloning::ethics::*;

// Require explicit consent for voice cloning
let consent_manager = ConsentManager::new();

// Verify consent before cloning
let consent_token = consent_manager
    .verify_consent(&speaker_identity, &voice_samples, ConsentLevel::FullCloning)
    .await?;

let cloner = VoiceCloner::builder()
    .with_consent_requirement(consent_token)
    .with_usage_tracking(true)
    .build()
    .await?;
```
## Usage Monitoring

```rust
use voirs_cloning::monitoring::*;
use chrono::Utc;

// Track voice cloning usage
let monitor = UsageMonitor::builder()
    .with_audit_logging(true)
    .with_anomaly_detection(true)
    .build()?;

// Log cloning activity
monitor
    .log_cloning_activity(CloningActivity {
        speaker_id: "speaker_123".to_string(),
        target_text: text.clone(),
        quality_score: result.quality_score,
        timestamp: Utc::now(),
        usage_purpose: UsagePurpose::PersonalAssistant,
    })
    .await?;
```
## Performance Characteristics

| Operation | Time | Memory | GPU Memory | Notes |
|---|---|---|---|---|
| Embedding Extraction | 150 ms | 500 MB | 2 GB | Per 10 s of audio |
| Few-shot Adaptation | 2.5 min | 2 GB | 8 GB | 5 samples, 200 steps |
| Real-time Synthesis | 0.1× RTF | 1 GB | 4 GB | With cloned voice |
| Similarity Calculation | 50 ms | 200 MB | 1 GB | Embedding comparison |
| Quality Assessment | 800 ms | 800 MB | 3 GB | Comprehensive metrics |
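In the table, RTF is the real-time factor: processing time divided by audio duration, so 0.1× RTF means ten seconds of audio are synthesized in about one second. A small helper for measuring it (illustrative, not part of the crate):

```rust
use std::time::Instant;

/// Real-time factor: processing time / audio duration.
/// Values below 1.0 mean faster than real time.
fn measure_rtf(audio_duration_secs: f64, synthesize: impl FnOnce()) -> f64 {
    let start = Instant::now();
    synthesize();
    start.elapsed().as_secs_f64() / audio_duration_secs
}
```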
## Performance Configuration

```rust
use voirs_cloning::config::*;

// Performance-optimized configuration
let fast_config = CloningConfig::builder()
    .with_performance_mode(PerformanceMode::Fast)
    .with_batch_size(16)
    .with_gpu_optimization(true)
    .with_memory_limit(4_000_000_000) // 4 GB
    .build()?;

// Quality-optimized configuration
let quality_config = CloningConfig::builder()
    .with_performance_mode(PerformanceMode::HighQuality)
    .with_adaptation_steps(500)
    .with_quality_threshold(0.95)
    .build()?;
```
## Testing

```bash
# Run voice cloning tests
cargo test --package voirs-cloning

# Run similarity measurement tests
cargo test --package voirs-cloning similarity

# Run quality assessment tests
cargo test --package voirs-cloning quality

# Run cross-lingual cloning tests
cargo test --package voirs-cloning cross_lingual

# Run performance benchmarks
cargo bench --package voirs-cloning
```
## Acoustic Model Integration

```rust
use voirs_cloning::acoustic::*;

// Integrate with acoustic models
let acoustic_adapter = AcousticModelAdapter::new();
let adapted_model = acoustic_adapter
    .adapt_model(&base_model, &speaker_embedding)
    .await?;
```
## Examples

See the `examples/` directory for comprehensive usage examples:

- `voice_cloning_example.rs` - Basic voice cloning
- `speaker_similarity.rs` - Similarity measurement
- `real_time_adaptation.rs` - Live adaptation
- `cross_lingual_cloning.rs` - Multi-language cloning

## License

Licensed under either of Apache License 2.0 or MIT License at your option.
Part of the VoiRS neural speech synthesis ecosystem.