//! End-to-end text-to-speech pipeline built around [`IndexTTS`]: text
//! normalization and tokenization, speaker/emotion/semantic encoding,
//! mel-spectrogram generation, and BigVGAN vocoding.
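//!
//! A minimal usage sketch (the file paths below are placeholders and error
//! handling is elided; this is an illustration, not a tested doctest):
//!
//! ```ignore
//! // Build the pipeline from a config file, then clone the voice of a short
//! // reference recording and write the synthesized speech to disk.
//! let tts = IndexTTS::load("checkpoints/config.yaml")?;
//! let options = SynthesisOptions::default();
//! let result = tts.synthesize_to_file(
//!     "Hello from IndexTTS!",
//!     "reference_speaker.wav",
//!     "output.wav",
//!     &options,
//! )?;
//! println!("{} of audio, RTF {:.3}", result.duration_formatted(), result.rtf);
//! ```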
use crate::{
    audio::{load_audio, save_audio, AudioConfig, AudioData},
    config::Config,
    model::{EmotionEncoder, SamplingStrategy, SemanticEncoder, SpeakerEncoder},
    text::{TextNormalizer, TextTokenizer, TokenizerConfig},
    vocoder::{BigVGAN, BigVGANConfig, Vocoder},
    Result,
};
use ndarray::Array1;
use std::path::{Path, PathBuf};
use std::time::Instant;

/// Options controlling a single synthesis run.
#[derive(Debug, Clone)]
pub struct SynthesisOptions {
    /// Explicit emotion vector; if `None`, a neutral emotion is used.
    pub emotion_vector: Option<Vec<f32>>,
    /// Optional reference audio for emotion conditioning.
    pub emotion_audio: Option<PathBuf>,
    /// Strength applied to the emotion vector (1.0 = unscaled).
    pub emotion_alpha: f32,
    /// Token sampling strategy used during generation.
    pub sampling: SamplingStrategy,
    /// Penalty applied to repeated tokens (values above 1.0 discourage repetition).
    pub repetition_penalty: f32,
    /// Maximum generation length in tokens.
    pub max_length: usize,
    /// Silence inserted between segments in `synthesize_long`, in milliseconds.
    pub segment_silence_ms: u32,
}

impl Default for SynthesisOptions {
    fn default() -> Self {
        Self {
            emotion_vector: None,
            emotion_audio: None,
            emotion_alpha: 1.0,
            sampling: SamplingStrategy::TopKP { k: 50, p: 0.95 },
            repetition_penalty: 1.1,
            max_length: 250,
            segment_silence_ms: 200,
        }
    }
}

/// Output of a synthesis run.
#[derive(Debug)]
pub struct SynthesisResult {
    /// Generated mono audio samples.
    pub audio: Vec<f32>,
    /// Sample rate of the generated audio in Hz.
    pub sample_rate: u32,
    /// Duration of the generated audio in seconds.
    pub duration: f32,
    /// Wall-clock processing time in seconds.
    pub processing_time: f32,
    /// Real-time factor (processing time divided by audio duration).
    pub rtf: f32,
}

impl SynthesisResult {
    /// Save the generated audio to `path`.
    pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        let audio_data = AudioData::new(self.audio.clone(), self.sample_rate);
        save_audio(path, &audio_data)
    }

    /// Format the duration as `MM:SS`.
    pub fn duration_formatted(&self) -> String {
        let minutes = (self.duration / 60.0) as u32;
        let seconds = (self.duration % 60.0) as u32;
        format!("{:02}:{:02}", minutes, seconds)
    }
}

/// The IndexTTS synthesis pipeline.
pub struct IndexTTS {
    /// Text normalizer applied before tokenization.
    normalizer: TextNormalizer,
    /// BPE tokenizer for normalized text.
    tokenizer: TextTokenizer,
    /// Speaker embedding extractor.
    speaker_encoder: SpeakerEncoder,
    /// Emotion embedding encoder.
    emotion_encoder: EmotionEncoder,
    /// Semantic code extractor for reference audio.
    semantic_encoder: SemanticEncoder,
    /// Neural vocoder converting mel spectrograms to waveforms.
    vocoder: BigVGAN,
    /// Audio/mel extraction parameters.
    audio_config: AudioConfig,
    /// Full loaded configuration.
    config: Config,
}

impl IndexTTS {
    /// Create a new IndexTTS instance from a validated configuration.
    pub fn new(config: Config) -> Result<Self> {
        config.validate()?;

        log::info!("Initializing IndexTTS...");

        // Text processing: normalizer and BPE tokenizer.
        let normalizer = TextNormalizer::new();
        let tokenizer = TextTokenizer::new(TokenizerConfig {
            model_path: config.dataset.bpe_model.display().to_string(),
            vocab_size: config.dataset.vocab_size,
            ..Default::default()
        })?;

        // Speaker, emotion, and semantic encoders.
        let speaker_encoder = SpeakerEncoder::new_placeholder(192);
        let emotion_encoder = EmotionEncoder::new(
            config.emotions.num_dims,
            config.emotions.num.clone(),
            256,
        );
        let semantic_encoder = SemanticEncoder::new_placeholder();

        // Vocoder configured from the s2mel preprocessing settings.
        let vocoder_config = BigVGANConfig {
            sample_rate: config.s2mel.preprocess.sr,
            num_mels: config.s2mel.preprocess.n_mels,
            ..Default::default()
        };
        let vocoder = BigVGAN::new_fallback(vocoder_config);

        // Audio/mel extraction parameters shared across the pipeline.
        let audio_config = AudioConfig {
            sample_rate: config.s2mel.preprocess.sr,
            n_fft: config.s2mel.preprocess.n_fft,
            hop_length: config.s2mel.preprocess.hop_length,
            win_length: config.s2mel.preprocess.win_length,
            n_mels: config.s2mel.preprocess.n_mels,
            fmin: config.s2mel.preprocess.fmin,
            fmax: config.s2mel.preprocess.fmax,
        };

        log::info!("IndexTTS initialized successfully");

        Ok(Self {
            normalizer,
            tokenizer,
            speaker_encoder,
            emotion_encoder,
            semantic_encoder,
            vocoder,
            audio_config,
            config,
        })
    }

    /// Load an IndexTTS instance from a configuration file.
    pub fn load<P: AsRef<Path>>(config_path: P) -> Result<Self> {
        let config = Config::load(config_path)?;
        Self::new(config)
    }

    /// Synthesize `text` in the voice of the reference recording at
    /// `speaker_audio_path`, returning the generated audio and timing stats.
    pub fn synthesize(
        &self,
        text: &str,
        speaker_audio_path: &str,
        options: &SynthesisOptions,
    ) -> Result<SynthesisResult> {
        let start_time = Instant::now();

        // Log a short preview of the input text (character-safe truncation,
        // so multi-byte characters cannot cause a slicing panic).
        let preview: String = text.chars().take(50).collect();
        log::info!("Starting synthesis for: {}", preview);

        // 1. Normalize and tokenize the input text.
        log::debug!("Normalizing text...");
        let normalized_text = self.normalizer.normalize(text)?;

        log::debug!("Tokenizing text...");
        let tokens = self.tokenizer.encode(&normalized_text)?;
        log::debug!("Generated {} tokens", tokens.len());

        // 2. Load the reference audio and extract speaker conditioning.
        log::debug!("Loading speaker audio...");
        let speaker_audio = load_audio(speaker_audio_path, Some(self.audio_config.sample_rate))?;

        log::debug!("Extracting speaker embedding...");
        let mel_spec = crate::audio::mel_spectrogram(&speaker_audio.samples, &self.audio_config)?;
        let speaker_embedding = self.speaker_encoder.encode(&mel_spec)?;

        log::debug!("Extracting semantic codes...");
        let semantic_codes = self
            .semantic_encoder
            .encode(&speaker_audio.samples, self.audio_config.sample_rate)?;

        // 3. Build the emotion conditioning (explicit vector or neutral fallback).
        log::debug!("Preparing emotion conditioning...");
        let emotion_embedding = if let Some(ref emo_vec) = options.emotion_vector {
            let emo = self.emotion_encoder.apply_strength(emo_vec, options.emotion_alpha);
            self.emotion_encoder.encode(&emo)?
        } else {
            let neutral = self.emotion_encoder.neutral();
            self.emotion_encoder.encode(&neutral)?
        };

        // 4. Generate the mel spectrogram and run the vocoder.
        log::debug!("Generating mel spectrogram...");
        let mel_length = (tokens.len() as f32 * 2.5) as usize;
        let mel_spec = self.generate_mel_spectrogram(
            &tokens,
            &semantic_codes,
            &speaker_embedding,
            &emotion_embedding,
            mel_length,
        )?;

        log::debug!("Running vocoder...");
        let audio = self.vocoder.synthesize(&mel_spec)?;

        // 5. Post-process (peak normalization and boundary fades).
        log::debug!("Post-processing...");
        let audio = self.post_process(&audio);

        let processing_time = start_time.elapsed().as_secs_f32();
        let duration = audio.len() as f32 / self.vocoder.sample_rate() as f32;
        let rtf = processing_time / duration;

        log::info!(
            "Synthesis complete: {:.2}s audio in {:.2}s (RTF: {:.3})",
            duration,
            processing_time,
            rtf
        );

        Ok(SynthesisResult {
            audio,
            sample_rate: self.vocoder.sample_rate(),
            duration,
            processing_time,
            rtf,
        })
    }

    /// Synthesize `text` and write the resulting audio to `output_path`.
    pub fn synthesize_to_file(
        &self,
        text: &str,
        speaker_audio_path: &str,
        output_path: &str,
        options: &SynthesisOptions,
    ) -> Result<SynthesisResult> {
        let result = self.synthesize(text, speaker_audio_path, options)?;
        result.save(output_path)?;
        log::info!("Saved audio to: {}", output_path);
        Ok(result)
    }

    /// Generate a mel spectrogram from tokens and conditioning information.
    ///
    /// This is currently a placeholder that produces a structured random
    /// spectrogram; the conditioning inputs are accepted but not yet used.
    fn generate_mel_spectrogram(
        &self,
        _tokens: &[i64],
        _semantic_codes: &[i64],
        _speaker_embedding: &Array1<f32>,
        _emotion_embedding: &Array1<f32>,
        mel_length: usize,
    ) -> Result<ndarray::Array2<f32>> {
        use rand::Rng;
        let mut rng = rand::thread_rng();

        let n_mels = self.audio_config.n_mels;
        let mut mel = ndarray::Array2::zeros((n_mels, mel_length));

        // Fill each frame with a frequency-dependent base level, a slow
        // time-varying modulation, and a small amount of noise.
        for t in 0..mel_length {
            for freq in 0..n_mels {
                let base_value = -4.0 + (freq as f32 / n_mels as f32) * 2.0;
                let time_mod = ((t as f32 * 0.1).sin() + 1.0) * 0.5;
                let noise = rng.gen_range(-0.5..0.5);
                mel[[freq, t]] = base_value + time_mod + noise;
            }
        }

        Ok(mel)
    }

    /// Apply peak normalization and short fades to the raw vocoder output.
    fn post_process(&self, audio: &[f32]) -> Vec<f32> {
        use crate::audio::{apply_fade, normalize_audio_peak};

        // Normalize to a peak just below full scale to avoid clipping.
        let normalized = normalize_audio_peak(audio, 0.89);

        // Apply a 5 ms fade-in and fade-out to avoid clicks at the boundaries.
        let fade_samples = (self.audio_config.sample_rate as f32 * 0.005) as usize;
        apply_fade(&normalized, fade_samples, fade_samples)
    }

    /// Synthesize long text by splitting it into segments, synthesizing each
    /// segment independently, and joining the results with
    /// `options.segment_silence_ms` of silence in between.
    pub fn synthesize_long(
        &self,
        text: &str,
        speaker_audio_path: &str,
        options: &SynthesisOptions,
    ) -> Result<SynthesisResult> {
        let start_time = Instant::now();

        // Split the text into manageable segments.
        let segments = super::segment_text(text, 100);
        log::info!("Split text into {} segments", segments.len());

        // Synthesize each segment with the same speaker and options.
        let mut audio_segments = Vec::new();
        for (i, segment) in segments.iter().enumerate() {
            log::info!("Synthesizing segment {}/{}", i + 1, segments.len());
            let result = self.synthesize(segment, speaker_audio_path, options)?;
            audio_segments.push(result.audio);
        }

        // Join the segments with the configured inter-segment silence.
        let audio = super::concatenate_audio(
            &audio_segments,
            options.segment_silence_ms,
            self.vocoder.sample_rate(),
        );

        let processing_time = start_time.elapsed().as_secs_f32();
        let duration = audio.len() as f32 / self.vocoder.sample_rate() as f32;
        let rtf = processing_time / duration;

        Ok(SynthesisResult {
            audio,
            sample_rate: self.vocoder.sample_rate(),
            duration,
            processing_time,
            rtf,
        })
    }

    /// Output sample rate of the vocoder in Hz.
    pub fn sample_rate(&self) -> u32 {
        self.vocoder.sample_rate()
    }

    /// The configuration this instance was created with.
    pub fn config(&self) -> &Config {
        &self.config
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_synthesis_options_default() {
        let options = SynthesisOptions::default();
        assert_eq!(options.emotion_alpha, 1.0);
        assert!(matches!(options.sampling, SamplingStrategy::TopKP { .. }));
    }
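
    // Illustrative addition (hypothetical values): defaults can be overridden
    // selectively with struct update syntax.
    #[test]
    fn test_synthesis_options_override() {
        let options = SynthesisOptions {
            emotion_alpha: 0.5,
            segment_silence_ms: 300,
            ..Default::default()
        };
        assert_eq!(options.emotion_alpha, 0.5);
        assert_eq!(options.segment_silence_ms, 300);
        // Untouched fields keep their default values.
        assert_eq!(options.max_length, 250);
    }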

    #[test]
    fn test_synthesis_result_duration() {
        let result = SynthesisResult {
            audio: vec![0.0; 22050 * 125],
            sample_rate: 22050,
            duration: 125.0,
            processing_time: 10.0,
            rtf: 0.08,
        };

        assert_eq!(result.duration_formatted(), "02:05");
    }

    #[test]
    fn test_segment_text() {
        let text = "This is sentence one. This is sentence two. This is sentence three.";
        let segments = super::super::segment_text(text, 50);
        assert!(segments.len() >= 2);
    }

    #[test]
    fn test_concatenate_audio() {
        let seg1 = vec![1.0f32; 100];
        let seg2 = vec![2.0f32; 100];
        let result = super::super::concatenate_audio(&[seg1, seg2], 10, 1000);
        // 100 samples + 10 ms of silence at 1 kHz (10 samples) + 100 samples.
        assert_eq!(result.len(), 210);
    }
}