"""Gradio demo for BreezyVoice zero-shot TTS (voice cloning from a short prompt clip)."""
import os
import sys
import gradio as gr
import numpy as np
import torch
import librosa
import torchaudio
from scipy.signal import resample
import time
import requests
from huggingface_hub import snapshot_download

# Make the vendored Matcha-TTS package importable before loading cosyvoice.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS')

from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

# Preset prompt voices: HF-hosted sample audio plus its exact transcription.
preset_speakers = {
    "6歲": {
        "url": "https://huggingface.co/datasets/kanahomaisa/breezyvoice-samples/resolve/main/smile_train.wav",
        "transcription": "名字是微笑號,只是呢你們看,這一輛微笑號它這裡有寫八百型的喔,它是八百山出來"
    },
    "8歲": {
        "url": "https://huggingface.co/datasets/kanahomaisa/breezyvoice-samples/resolve/main/utensils.wav",
        "transcription": "出來了出來了,你知道這個餐具是可以挖的,這個餐具可以用窩課魅一起挖嗎?"
    }
}

max_val = 0.8       # peak-normalization ceiling applied in postprocess()
target_sr = 22050   # sample rate of the synthesized output WAV
prompt_sr = 16000   # sample rate the prompt audio is loaded at


def download_audio_from_hf(url, save_path="temp_prompt.wav"):
    """Download a prompt WAV from Hugging Face to *save_path* and return the path.

    Raises requests.HTTPError on a non-2xx response.
    """
    # FIX: the original call had no timeout, so a stalled connection could
    # hang the Gradio callback forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)
    return save_path


def apply_preset(speaker_key):
    """Resolve a preset speaker key into (local audio path, transcription).

    Returns (None, "") for an empty/unknown key so the UI fields are cleared.
    """
    preset = preset_speakers.get(speaker_key)
    if preset is None:
        return None, ""
    local_path = download_audio_from_hf(preset["url"])
    return local_path, preset["transcription"]


def download_with_retry(repo_id, max_retries=10, wait_sec=5):
    """Download a HF model snapshot, retrying up to *max_retries* times.

    Returns the local model directory; re-raises the last error when all
    attempts fail.
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"正在下載模型(第 {attempt} 次)")
            model_dir = snapshot_download(repo_id)
            print("模型下載成功")
            return model_dir
        except Exception as e:
            print(f"第 {attempt} 次下載失敗:{e}")
            if attempt < max_retries:
                print(f"等待 {wait_sec} 秒後重試")
                time.sleep(wait_sec)
            else:
                print("已達最大重試次數")
                # FIX: bare `raise` preserves the original traceback
                # (`raise e` re-raises from this frame).
                raise


# Module-level model load: downloading the snapshot and building the TTS
# engine happens once at import time, before the UI is defined.
model_dir = download_with_retry("MediaTek-Research/BreezyVoice")
cosyvoice = CosyVoice(model_dir)


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim leading/trailing silence, peak-normalize, and pad 0.2 s of tail silence.

    *speech* is a (1, samples) torch tensor as returned by load_wav.
    """
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    # Append 0.2 s of silence so synthesis does not clip the final syllable.
    speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
    return speech


def synthesize_speech(speaker_audio_path, content_text, speaker_text, speaker_key):
    """Run zero-shot TTS: clone the prompt voice and speak *content_text*.

    Returns the path of the generated WAV; raises gr.Error on invalid input.
    """
    # FIX: the original if/else assigned prompt_wav_path = speaker_audio_path
    # identically in both branches; only the preset-transcription fallback
    # actually differed.
    prompt_wav_path = speaker_audio_path
    if speaker_key in preset_speakers and not speaker_text.strip():
        # Fall back to the preset's known transcription when the user left
        # the transcription box empty.
        speaker_text = preset_speakers[speaker_key]["transcription"]

    if not prompt_wav_path:
        raise gr.Error("請上傳或選擇語音樣本")
    if not speaker_text or len(speaker_text.strip()) < 5:
        raise gr.Error("語音樣本的轉寫內容太短,至少輸入5字以上")

    prompt_speech_16k = postprocess(load_wav(prompt_wav_path, prompt_sr))
    output = cosyvoice.inference_zero_shot(content_text, speaker_text, prompt_speech_16k)
    audio_data = output['tts_speech'].numpy().flatten()

    os.makedirs("results", exist_ok=True)
    out_path = "results/output.wav"
    torchaudio.save(out_path, torch.tensor(audio_data).unsqueeze(0), sample_rate=target_sr)
    return out_path


with gr.Blocks() as demo:
    gr.Markdown("""
    # 小睿語音合成
    選擇預設語音 (6歲、8歲) 或上傳5~15秒語音樣本及該則語音樣本的轉寫,並輸入要合成的句子。
    視該時段運算資源和語句長度而定約需70至2000秒,若等候時間過長,可改用較快但較不穩的版本: https://colab.research.google.com/drive/15gfHseSyHhsQi8FMviwptJ95QllMcMOe?usp=sharing
    """)

    speaker_selector = gr.Dropdown(
        label="選擇語音樣本(可選)",
        choices=["", *preset_speakers.keys()],
        value="",
        interactive=True
    )
    with gr.Row():
        audio_input = gr.Audio(label="上傳或套用語音樣本", type="filepath", interactive=True)
        content_input = gr.Textbox(label="要合成的文字句子", placeholder="例如:今天天氣真好")
    transcription_input = gr.Textbox(
        label="語音樣本的轉寫(必填)",
        placeholder="例如:只是呢你們看,這一輛微笑號它這裡有寫八百型的喔...",
        lines=3
    )
    output_audio = gr.Audio(label="合成結果", type="filepath")
    btn = gr.Button("開始語音合成")

    # Selecting a preset downloads its sample and fills both input fields.
    speaker_selector.change(
        fn=apply_preset,
        inputs=speaker_selector,
        outputs=[audio_input, transcription_input]
    )
    btn.click(
        fn=synthesize_speech,
        inputs=[audio_input, content_input, transcription_input, speaker_selector],
        outputs=output_audio
    )

    gr.Markdown("""
    為了加速,已關閉自動語音辨識,務必正確輸入語音樣本的文字轉寫。
    """)

if __name__ == "__main__":
    demo.launch()