| Crates.io | voirs-acoustic |
| lib.rs | voirs-acoustic |
| version | 0.1.0-alpha.1 |
| created_at | 2025-09-21 04:10:32.813108+00 |
| updated_at | 2025-09-21 04:10:32.813108+00 |
| description | Acoustic model inference for VoiRS speech synthesis (VITS/FastSpeech2) |
| homepage | https://github.com/cool-japan/voirs |
| repository | https://github.com/cool-japan/voirs |
| max_upload_size | |
| id | 1848421 |
| size | 2,079,051 |
Neural acoustic modeling for VoiRS speech synthesis - converts phonemes to mel spectrograms.
This crate implements state-of-the-art neural acoustic models including VITS (Variational Inference Text-to-Speech) and FastSpeech2. It serves as the core component in the VoiRS pipeline, transforming phonetic representations into mel spectrograms that can be converted to audio by vocoders.
use voirs_acoustic::{VitsModel, AcousticModel, MelSpectrogram};
use voirs_g2p::Phoneme;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load pre-trained VITS model
    let model = VitsModel::from_pretrained("vits-en-us-female").await?;

    // Convert phonemes to mel spectrogram
    let phonemes: Vec<Phoneme> = vec![/* phonemes from G2P */];
    let mel: MelSpectrogram = model.synthesize(&phonemes, None).await?;

    // Mel spectrogram is ready for vocoder
    println!("Generated mel: {}x{}", mel.n_mels(), mel.n_frames());

    Ok(())
}
| Model | Type | Quality (MOS) | Speed (RTF) | Size | Status |
|---|---|---|---|---|---|
| VITS-EN-US | VITS | 4.42 | 0.28× | 89MB | ✅ Stable |
| VITS-EN-UK | VITS | 4.38 | 0.28× | 89MB | ✅ Stable |
| VITS-JP | VITS | 4.35 | 0.31× | 92MB | ✅ Stable |
| VITS-Multilingual | VITS | 4.15 | 0.35× | 156MB | 🚧 Beta |
| FastSpeech2-EN | Non-AR | 4.21 | 0.15× | 67MB | 🚧 Beta |
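RTF (real-time factor) in this table is synthesis compute time divided by the duration of the audio produced, so values below 1.0 are faster than real time. A quick back-of-the-envelope check:

```rust
// Rough planning arithmetic: synthesis time ≈ RTF × audio duration.
fn estimated_synthesis_secs(rtf: f32, audio_secs: f32) -> f32 {
    rtf * audio_secs
}

fn main() {
    // VITS-EN-US at RTF 0.28 (from the table above), for 10 s of speech:
    println!("{:.1} s of compute", estimated_synthesis_secs(0.28, 10.0)); // ≈ 2.8 s
}
```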
Phonemes → Text Encoder → Posterior Encoder → Flow → Decoder → Mel Spectrogram
[P, H, OW]   (Transformer)   (CNN features)   (normalizing flows)   (CNN generator)   [80, 256]
- Text Encoder
- Posterior Encoder (VITS only)
- Normalizing Flows (VITS only)
- Decoder/Generator
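Whichever internal stages a model uses, the public contract matches the endpoints of the diagram above: phonemes in, a mel spectrogram of shape [n_mels, n_frames] out. A minimal sanity-check sketch (the 80-channel default comes from the configuration section further down):

```rust
use voirs_acoustic::{AcousticModel, MelSpectrogram, VitsModel};
use voirs_g2p::Phoneme;

// Verify the end-to-end shape contract: n_mels channels (80 by default) and
// one frame per hop of generated audio, regardless of the internal
// encoder/flow/decoder wiring.
async fn check_output_shape(
    model: &VitsModel,
    phonemes: &[Phoneme],
) -> Result<(), Box<dyn std::error::Error>> {
    let mel: MelSpectrogram = model.synthesize(phonemes, None).await?;
    assert_eq!(mel.n_mels(), 80);
    println!("{} phonemes -> {} mel frames", phonemes.len(), mel.n_frames());
    Ok(())
}
```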
#[async_trait]
pub trait AcousticModel: Send + Sync {
    /// Generate mel spectrogram from phonemes
    async fn synthesize(
        &self,
        phonemes: &[Phoneme],
        config: Option<&SynthesisConfig>,
    ) -> Result<MelSpectrogram>;

    /// Batch synthesis for multiple inputs
    async fn synthesize_batch(
        &self,
        inputs: &[&[Phoneme]],
        configs: Option<&[SynthesisConfig]>,
    ) -> Result<Vec<MelSpectrogram>>;

    /// Get model metadata and capabilities
    fn metadata(&self) -> ModelMetadata;

    /// Check if model supports specific features
    fn supports(&self, feature: ModelFeature) -> bool;
}
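Because both VITS and FastSpeech2 models implement this trait, downstream code can stay generic over the backend. A minimal sketch, assuming `Result` is the crate's result alias used in the trait signatures above:

```rust
use voirs_acoustic::{AcousticModel, MelSpectrogram, Result, SynthesisConfig};
use voirs_g2p::Phoneme;

// Generic over any acoustic model; swap VITS for FastSpeech2 without
// touching the call sites.
async fn synthesize_all<M: AcousticModel>(
    model: &M,
    utterances: &[Vec<Phoneme>],
    config: Option<&SynthesisConfig>,
) -> Result<Vec<MelSpectrogram>> {
    let mut mels = Vec::with_capacity(utterances.len());
    for phonemes in utterances {
        // Per-utterance synthesis; `synthesize_batch` is the bulk alternative
        // when all inputs share one configuration.
        mels.push(model.synthesize(phonemes, config).await?);
    }
    Ok(mels)
}
```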
pub struct VitsModel {
    text_encoder: TextEncoder,
    posterior_encoder: PosteriorEncoder,
    decoder: Decoder,
    flow: NormalizingFlows,
    device: Device,
    config: VitsConfig,
}
impl VitsModel {
    /// Load pre-trained model from HuggingFace Hub
    pub async fn from_pretrained(model_id: &str) -> Result<Self>;

    /// Load model from local files
    pub async fn from_files(
        config_path: &Path,
        weights_path: &Path,
    ) -> Result<Self>;

    /// Generate with speaker control
    pub async fn synthesize_with_speaker(
        &self,
        phonemes: &[Phoneme],
        speaker_id: Option<u32>,
        emotion: Option<EmotionVector>,
    ) -> Result<MelSpectrogram>;
}
#[derive(Debug, Clone)]
pub struct MelSpectrogram {
    /// Mel filterbank features [n_mels, n_frames]
    data: Tensor,
    /// Sample rate in Hz
    sample_rate: u32,
    /// Hop length in samples
    hop_length: u32,
    /// Number of mel channels
    n_mels: u32,
}
impl MelSpectrogram {
    /// Get mel values as ndarray for processing
    pub fn to_array(&self) -> Array2<f32>;

    /// Convert to raw tensor for vocoder input
    pub fn to_tensor(&self) -> &Tensor;

    /// Get duration in seconds
    pub fn duration(&self) -> f32;

    /// Visualize mel spectrogram (feature: "plotting")
    #[cfg(feature = "plotting")]
    pub fn plot(&self) -> Plot;
}
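Each mel frame covers `hop_length` audio samples, so the reported duration should be close to `n_frames * hop_length / sample_rate`. A quick consistency check, using the default 22.05 kHz sample rate and hop length of 256 from the configuration section below:

```rust
use voirs_acoustic::MelSpectrogram;

// Sketch: relate frame count to audio duration. Constants are the documented
// defaults (22_050 Hz sample rate, 256-sample hop); real values come from the
// model's configuration.
fn expected_duration_secs(mel: &MelSpectrogram) -> f32 {
    const HOP_LENGTH: f32 = 256.0;
    const SAMPLE_RATE: f32 = 22_050.0;
    mel.n_frames() as f32 * HOP_LENGTH / SAMPLE_RATE
}

fn report(mel: &MelSpectrogram) {
    println!(
        "frames: {}, duration(): {:.2} s, frames*hop/rate: {:.2} s",
        mel.n_frames(),
        mel.duration(),
        expected_duration_secs(mel),
    );
}
```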
use voirs_acoustic::{VitsModel, AcousticModel};
use voirs_g2p::Phoneme;

let model = VitsModel::from_pretrained("vits-en-us-female").await?;

// Simple synthesis
let phonemes: Vec<Phoneme> = vec![/* phonemes from G2P */];
let mel = model.synthesize(&phonemes, None).await?;
use voirs_acoustic::{VitsModel, SynthesisConfig, SpeakerConfig, EmotionVector, Gender};

let model = VitsModel::from_pretrained("vits-multilingual").await?;

let config = SynthesisConfig {
    speaker: Some(SpeakerConfig {
        speaker_id: Some(42),
        emotion: Some(EmotionVector::happy(0.8)),
        age: Some(25.0),
        gender: Some(Gender::Female),
    }),
    ..Default::default()
};

let mel = model.synthesize(&phonemes, Some(&config)).await?;
use voirs_acoustic::{SynthesisConfig, ProsodyConfig, DurationControl, PitchControl};

let config = SynthesisConfig {
    prosody: Some(ProsodyConfig {
        speaking_rate: 1.2,   // 20% faster
        pitch_shift: 0.1,     // 10% higher pitch
        energy_scale: 1.1,    // 10% more energy
        duration_control: DurationControl::Predictive,
        pitch_control: PitchControl::Neural,
    }),
    ..Default::default()
};

let mel = model.synthesize(&phonemes, Some(&config)).await?;
use voirs_acoustic::{StreamingVits, StreamConfig};
use futures::StreamExt;

let model = StreamingVits::from_pretrained("vits-en-us-female").await?;

let stream_config = StreamConfig {
    chunk_size: 256,     // frames per chunk
    overlap: 64,         // overlap between chunks
    max_latency_ms: 50,  // maximum acceptable latency
};

let mut stream = model.synthesize_stream(&phonemes, stream_config).await?;

while let Some(mel_chunk) = stream.next().await {
    let chunk = mel_chunk?;
    // Process chunk immediately for low latency
    send_to_vocoder(chunk).await?;
}
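The 64-frame `overlap` means consecutive chunks share boundary frames. How the stream resolves that overlap is not shown here; assuming overlapping chunks are delivered as-is and simple trimming (rather than cross-fading) is acceptable, one way to stitch them back together is:

```rust
use ndarray::{concatenate, s, Array2, Axis};
use voirs_acoustic::MelSpectrogram;

// Sketch: drop the leading `overlap` frames of every chunk after the first,
// then concatenate along the time axis ([n_mels, n_frames] layout).
fn stitch_chunks(chunks: &[MelSpectrogram], overlap: usize) -> Option<Array2<f32>> {
    let pieces: Vec<Array2<f32>> = chunks
        .iter()
        .enumerate()
        .map(|(i, chunk)| {
            let arr = chunk.to_array();
            if i == 0 {
                arr
            } else {
                arr.slice(s![.., overlap..]).to_owned()
            }
        })
        .collect();
    let views: Vec<_> = pieces.iter().map(Array2::view).collect();
    concatenate(Axis(1), &views).ok()
}
```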
use voirs_acoustic::{BatchProcessor, BatchConfig, PaddingStrategy};

let processor = BatchProcessor::new(model, BatchConfig {
    max_batch_size: 16,
    max_sequence_length: 1000,
    padding_strategy: PaddingStrategy::Longest,
});

let phoneme_batches: Vec<Vec<Phoneme>> = load_phoneme_data()?;
let mel_batches = processor.process_batches(&phoneme_batches).await?;
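`PaddingStrategy::Longest` pads every sequence in a batch up to the longest one, so throughput depends on how similar the lengths are. A small sketch for estimating that padding overhead before settling on `max_batch_size`:

```rust
use voirs_g2p::Phoneme;

// Fraction of the padded batch that is padding rather than real phonemes.
// 0.0 means all sequences are equal length; values near 1.0 waste compute.
fn padding_overhead(batch: &[Vec<Phoneme>]) -> f32 {
    let longest = batch.iter().map(Vec::len).max().unwrap_or(0);
    let real: usize = batch.iter().map(Vec::len).sum();
    let padded = longest * batch.len();
    if padded == 0 {
        0.0
    } else {
        1.0 - real as f32 / padded as f32
    }
}
```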
use voirs_acoustic::{VitsModel, VitsConfig, TextEncoderConfig, ModelLoader};

// Load from custom configuration
let config = VitsConfig {
    text_encoder: TextEncoderConfig {
        n_vocab: 512,
        hidden_channels: 384,
        filter_channels: 1536,
        n_heads: 2,
        n_layers: 6,
        kernel_size: 3,
        p_dropout: 0.1,
    },
    // ... other config
};

let model = VitsModel::from_config(config, "path/to/weights.safetensors").await?;
| Model | Backend | Device | RTF | Throughput | Memory |
|---|---|---|---|---|---|
| VITS-EN | Candle | CPU | 0.28× | 45 sent/s | 512MB |
| VITS-EN | Candle | CUDA | 0.04× | 320 sent/s | 2.1GB |
| VITS-EN | ONNX | CPU | 0.31× | 42 sent/s | 480MB |
| VITS-EN | ONNX | CUDA | 0.05× | 290 sent/s | 1.8GB |
| FastSpeech2 | Candle | CPU | 0.15× | 78 sent/s | 384MB |
| FastSpeech2 | Candle | CUDA | 0.02× | 450 sent/s | 1.2GB |
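The RTF figures above can be reproduced on local hardware by timing `synthesize` against the duration of the output:

```rust
use std::time::Instant;
use voirs_acoustic::{AcousticModel, VitsModel};
use voirs_g2p::Phoneme;

// Measure the real-time factor for one utterance: elapsed compute time
// divided by the duration of the audio the mel spectrogram represents.
async fn measure_rtf(
    model: &VitsModel,
    phonemes: &[Phoneme],
) -> Result<f32, Box<dyn std::error::Error>> {
    let start = Instant::now();
    let mel = model.synthesize(phonemes, None).await?;
    let elapsed = start.elapsed().as_secs_f32();
    Ok(elapsed / mel.duration())
}
```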
Add to your Cargo.toml:
[dependencies]
voirs-acoustic = "0.1"
# Enable specific backends
[dependencies.voirs-acoustic]
version = "0.1"
features = ["candle", "onnx", "gpu"]
- `candle`: Enable Candle backend (default)
- `onnx`: Enable ONNX Runtime backend
- `gpu`: Enable GPU acceleration (CUDA/Metal)
- `streaming`: Enable streaming synthesis
- `training`: Enable model training capabilities
- `plotting`: Enable mel spectrogram visualization
- `scirs`: Integration with SciRS2 for optimized DSP
- `numrs`: Integration with NumRS2 for linear algebra

CUDA backend:
# Ensure CUDA 11.8+ is installed
export CUDA_ROOT=/usr/local/cuda
export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
ONNX backend:
# Ubuntu/Debian
sudo apt-get install libonnxruntime-dev
# macOS
brew install onnxruntime
Create ~/.voirs/acoustic.toml:
[default]
backend = "candle" # candle, onnx
device = "auto" # auto, cpu, cuda:0, metal
precision = "fp32" # fp32, fp16
[models]
cache_dir = "~/.voirs/models/acoustic"
auto_download = true
verify_checksums = true
[candle]
enable_flash_attention = true
use_memory_pool = true
optimize_for_inference = true
[onnx]
provider = "cuda" # cpu, cuda, tensorrt
inter_op_threads = 4
intra_op_threads = 8
enable_profiling = false
[synthesis]
default_sample_rate = 22050
default_hop_length = 256
default_n_mels = 80
max_sequence_length = 1000
[streaming]
chunk_size = 256
overlap = 64
max_latency_ms = 50
buffer_size = 1024
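The file is ordinary TOML, so values such as the `[synthesis]` defaults can also be read from your own tooling. A sketch using the `serde` and `toml` crates (this is not voirs-acoustic's own loader, just a generic reader for the same file):

```rust
use serde::Deserialize;

// Maps only the [synthesis] table; other sections are ignored by serde.
#[derive(Debug, Deserialize)]
struct SynthesisSection {
    default_sample_rate: u32,
    default_hop_length: u32,
    default_n_mels: u32,
    max_sequence_length: u32,
}

#[derive(Debug, Deserialize)]
struct AcousticToml {
    synthesis: SynthesisSection,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let home = std::env::var("HOME")?;
    let text = std::fs::read_to_string(format!("{home}/.voirs/acoustic.toml"))?;
    let cfg: AcousticToml = toml::from_str(&text)?;
    println!(
        "{} mel bins, hop {}, {} Hz",
        cfg.synthesis.default_n_mels, cfg.synthesis.default_hop_length, cfg.synthesis.default_sample_rate
    );
    Ok(())
}
```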
use voirs_acoustic::{VitsTrainer, VitsConfig, TrainingConfig, OptimizerConfig, SchedulerConfig, DataLoader};

let config = TrainingConfig {
    model: VitsConfig::default(),
    optimizer: OptimizerConfig::adam(1e-4),
    scheduler: SchedulerConfig::exponential(0.999),
    batch_size: 32,
    gradient_accumulation: 4,
    max_epochs: 1000,
    ..Default::default()
};

let trainer = VitsTrainer::new(config);
let dataloader = DataLoader::from_manifest("train_manifest.json").await?;

trainer.train(dataloader).await?;
use voirs_acoustic::{VitsModel, FineTuner, SpeakerConfig};

let base_model = VitsModel::from_pretrained("vits-en-us-base").await?;
let fine_tuner = FineTuner::new(base_model);

let speaker_config = SpeakerConfig {
    speaker_data: "path/to/speaker/audio",
    target_hours: 2.0,     // 2 hours of data
    learning_rate: 1e-5,
    freeze_encoder: true,  // Only fine-tune decoder
};

let custom_model = fine_tuner.fine_tune(speaker_config).await?;
use voirs_acoustic::{AcousticError, ErrorKind};

match model.synthesize(&phonemes, None).await {
    Ok(mel) => println!("Success: {} frames", mel.n_frames()),
    Err(AcousticError { kind, context, .. }) => match kind {
        ErrorKind::ModelNotFound => {
            eprintln!("Model not found: {}", context);
        }
        ErrorKind::InvalidInput => {
            eprintln!("Invalid phoneme sequence: {}", context);
        }
        ErrorKind::InferenceError => {
            eprintln!("Model inference failed: {}", context);
        }
        ErrorKind::DeviceError => {
            eprintln!("GPU/device error: {}", context);
        }
        _ => eprintln!("Other error: {}", context),
    },
}
use voirs_acoustic::{VoiceMorpher, MorphingConfig};

let morpher = VoiceMorpher::new();

let config = MorphingConfig {
    source_speaker: 0,
    target_speaker: 1,
    interpolation_factor: 0.5,  // 50% blend
    preserve_prosody: true,
};

let morphed_mel = morpher.morph(&base_mel, &config).await?;
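`interpolation_factor` is a linear blend: 0.0 keeps the source speaker, 1.0 is fully the target, and 0.5 sits halfway between them. Conceptually (a sketch over plain vectors, not the morpher's internal speaker representation):

```rust
// Linear interpolation between two speaker representations:
// result = (1 - alpha) * source + alpha * target.
fn lerp_speakers(source: &[f32], target: &[f32], alpha: f32) -> Vec<f32> {
    source
        .iter()
        .zip(target)
        .map(|(s, t)| (1.0 - alpha) * s + alpha * t)
        .collect()
}
```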
#[cfg(feature = "plotting")]
use voirs_acoustic::{AttentionVisualizer, VisualizationConfig};
let visualizer = AttentionVisualizer::new();
let attention_weights = model.get_attention_weights(&phonemes).await?;
let plot = visualizer.plot_attention(
&attention_weights,
&phonemes,
&mel,
VisualizationConfig::default()
);
plot.save("attention.png")?;
We welcome contributions! Please see the main repository for contribution guidelines.
git clone https://github.com/cool-japan/voirs.git
cd voirs/crates/voirs-acoustic
# Install development dependencies
cargo install cargo-nextest cargo-criterion
# Run tests
cargo nextest run
# Run benchmarks
cargo bench
# Check code quality
cargo clippy -- -D warnings
cargo fmt --check
New acoustic models can be added by implementing the AcousticModel trait.

Licensed under either of:

at your option.