Spaces:
Build error
Build error
| import torch | |
| import torchaudio | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import List, Union | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| from speechbrain.pretrained import EncoderClassifier | |
| import tempfile | |
| import os | |
class VoiceCloneSystem:
    """Voice cloning system: speak input text in a target speaker's voice.

    The pipeline has two stages: a speaker encoder turns one or more
    reference recordings into a single speaker embedding, and a
    text-to-speech model plus vocoder synthesize a waveform conditioned on
    that embedding.
    """

    # Sample rate (Hz) that reference audio is resampled to before encoding.
    SAMPLE_RATE = 16000
    # Reference clips are centre-cropped or zero-padded to this many seconds.
    REFERENCE_SECONDS = 3
    # Width of the speaker embedding that generate_speech accepts.
    EMBEDDING_DIM = 512

    def __init__(self, device: str = "cpu"):
        """Load the speaker encoder, the TTS model and the vocoder.

        Args:
            device: Device to run on, 'cpu' or 'cuda'.
        """
        self.device = device
        print("正在加载模型...")

        # Speaker encoder; its files are cached under the relative
        # directory given by savedir.
        self.speaker_encoder = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-xvect-voxceleb",
            savedir="tmp/spkrec-xvect-voxceleb",
            run_opts={"device": device},
        )

        # Text-to-speech processor and model.
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
            "microsoft/speecht5_tts"
        ).to(device)

        # Vocoder handed to the TTS model when generating speech.
        self.vocoder = SpeechT5HifiGan.from_pretrained(
            "microsoft/speecht5_hifigan"
        ).to(device)
        print("模型加载完成!")

    def process_audio(self, waveform: torch.Tensor, sr: int) -> torch.Tensor:
        """Resample, down-mix to mono and fix the length of a waveform.

        Args:
            waveform: Input audio, either (channels, time) or a bare (time,)
                signal, which is treated as a single channel.
            sr: Sample rate of ``waveform`` in Hz.

        Returns:
            A (1, SAMPLE_RATE * REFERENCE_SECONDS) tensor with the same
            dtype and device as the input.
        """
        # A 1-D input has no channel axis; add one so the indexing below
        # is valid instead of failing on shape[1].
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Resample to the encoder's sample rate.
        if sr != self.SAMPLE_RATE:
            waveform = torchaudio.functional.resample(
                waveform, sr, self.SAMPLE_RATE
            )

        # Average the channels to obtain mono audio.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Normalize the clip length.
        target_length = self.SAMPLE_RATE * self.REFERENCE_SECONDS
        current_length = waveform.shape[1]
        if current_length > target_length:
            # Too long: keep the middle section.
            start = (current_length - target_length) // 2
            waveform = waveform[:, start:start + target_length]
        elif current_length < target_length:
            # Too short: pad the end with zeros. new_zeros inherits dtype
            # and device, so the concatenation cannot mismatch.
            padding = waveform.new_zeros(1, target_length - current_length)
            waveform = torch.cat([waveform, padding], dim=1)
        return waveform

    def extract_speaker_embedding(
        self,
        audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """Extract one speaker embedding from the reference recordings.

        Each file is loaded, passed through ``process_audio`` and encoded;
        the per-file embeddings are then averaged.

        Args:
            audio_paths: Paths of the reference audio files. A single path
                is also accepted and treated as a one-element list.

        Returns:
            The mean speaker embedding, with a leading batch axis added
            when the mean is 1-D.

        Raises:
            ValueError: If ``audio_paths`` is empty.
        """
        # A lone path would otherwise be iterated character by character.
        if isinstance(audio_paths, (str, Path)):
            audio_paths = [audio_paths]
        audio_paths = list(audio_paths)
        if not audio_paths:
            # Fail early with a clear message rather than letting
            # torch.stack reject an empty list later.
            raise ValueError(
                "audio_paths must contain at least one reference audio file"
            )

        embeddings = []
        for audio_path in audio_paths:
            try:
                waveform, sr = torchaudio.load(str(audio_path))
                waveform = self.process_audio(waveform, sr)

                with torch.no_grad():
                    # The encoder is fed a [batch, time] tensor with batch 1.
                    if waveform.dim() == 2:
                        waveform = waveform.squeeze(0)
                    embedding = self.speaker_encoder.encode_batch(
                        waveform.unsqueeze(0).to(self.device)
                    )
                    # Drop every singleton dimension.
                    embedding = embedding.squeeze()

                print(f"Raw embedding shape: {embedding.shape}")
                embeddings.append(embedding)
            except Exception as e:
                print(f"Error processing audio {audio_path}: {str(e)}")
                raise

        # Average over all reference clips.
        # NOTE(review): upstream SpeechT5 examples appear to L2-normalize
        # x-vectors before use; confirm whether that should happen here.
        mean_embedding = torch.stack(embeddings).mean(dim=0)

        # generate_speech expects a 2-D [1, EMBEDDING_DIM] tensor.
        if mean_embedding.dim() == 1:
            mean_embedding = mean_embedding.unsqueeze(0)

        print(f"Final embedding shape: {mean_embedding.shape}")
        return mean_embedding

    def generate_speech(
        self,
        text: str,
        speaker_embedding: torch.Tensor
    ) -> torch.Tensor:
        """Synthesize speech for ``text`` in the given speaker's voice.

        Args:
            text: Input text.
            speaker_embedding: Speaker embedding of shape [1, 512].

        Returns:
            The generated speech waveform.

        Raises:
            ValueError: If ``speaker_embedding`` is not 2-D with a second
                dimension of EMBEDDING_DIM.
        """
        try:
            # Validate first so a bad embedding fails before any
            # tokenization or model work is done.
            if (
                speaker_embedding.dim() != 2
                or speaker_embedding.size(1) != self.EMBEDDING_DIM
            ):
                raise ValueError(
                    f"Speaker embedding should have shape "
                    f"[1, {self.EMBEDDING_DIM}], "
                    f"but got {speaker_embedding.shape}"
                )

            inputs = self.processor(text=text, return_tensors="pt")

            speech = self.tts_model.generate_speech(
                inputs["input_ids"].to(self.device),
                speaker_embedding.to(self.device),
                vocoder=self.vocoder,
            )
            return speech
        except Exception as e:
            print(f"Error in generate_speech: {str(e)}")
            raise

    def clone_voice(
        self,
        text: str,
        reference_audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """Clone a voice: the main entry point.

        Args:
            text: Text to convert to speech.
            reference_audio_paths: Paths of the reference audio files.

        Returns:
            The generated speech waveform.
        """
        try:
            # 1. Extract the speaker embedding.
            speaker_embedding = self.extract_speaker_embedding(
                reference_audio_paths
            )
            # 2. Generate the speech.
            return self.generate_speech(text, speaker_embedding)
        except Exception as e:
            print(f"Error in clone_voice: {str(e)}")
            raise

    def save_audio(
        self,
        waveform: torch.Tensor,
        output_path: Union[str, Path],
        sample_rate: int = 16000
    ):
        """Write a waveform to an audio file.

        Args:
            waveform: Audio to save, either a 1-D (time,) signal or an
                already channel-first (channels, time) tensor.
            output_path: Destination file path; missing parent directories
                are created.
            sample_rate: Sample rate in Hz.
        """
        try:
            # Make sure the output directory exists.
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            audio = waveform.detach().cpu()
            # Add the channel axis only when it is missing, so a tensor
            # that is already 2-D is not turned into a 3-D one.
            if audio.dim() == 1:
                audio = audio.unsqueeze(0)

            torchaudio.save(str(output_path), audio, sample_rate)
        except Exception as e:
            print(f"Error saving audio: {str(e)}")
            raise