|
import os |
|
import sys |
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
import librosa |
|
import torchaudio |
|
from scipy.signal import resample |
|
import time |
|
import requests |
|
|
|
from huggingface_hub import snapshot_download |
|
|
|
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) |
|
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS') |
|
|
|
from cosyvoice.cli.cosyvoice import CosyVoice |
|
from cosyvoice.utils.file_utils import load_wav |
|
|
|
# Preset reference voices. Each entry maps a UI label (speaker age) to a
# hosted sample clip plus the exact transcription of that clip, which is
# required as the zero-shot prompt text for voice cloning.
preset_speakers = {
    "6歲": {
        "url": "https://huggingface.co/datasets/kanahomaisa/breezyvoice-samples/resolve/main/smile_train.wav",
        "transcription": "名字是微笑號,只是呢你們看,這一輛微笑號它這裡有寫八百型的喔,它是八百山出來"
    },
    "8歲": {
        "url": "https://huggingface.co/datasets/kanahomaisa/breezyvoice-samples/resolve/main/utensils.wav",
        "transcription": "出來了出來了,你知道這個餐具是可以挖的,這個餐具可以用窩課魅一起挖嗎?"
    }
}
|
|
|
def download_audio_from_hf(url, save_path="temp_prompt.wav", timeout=30):
    """Download an audio file from a URL and write it to a local path.

    Args:
        url: Direct download URL of the audio sample.
        save_path: Local file path to write the downloaded bytes to.
        timeout: Seconds before the HTTP request is aborted. New,
            backward-compatible parameter — the original call had no
            timeout and could block the UI thread indefinitely.

    Returns:
        The local path the file was written to (same as ``save_path``).

    Raises:
        requests.HTTPError: If the server returns an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)
    return save_path
|
|
|
def apply_preset(speaker_key):
    """Resolve a preset speaker choice into (local audio path, transcription).

    Downloads the preset's sample clip to a temp file. Unknown or empty
    keys yield ``(None, "")`` so the UI widgets are cleared.
    """
    preset = preset_speakers.get(speaker_key)
    if preset is None:
        return None, ""
    local_path = download_audio_from_hf(preset["url"])
    return local_path, preset["transcription"]
|
|
|
# Peak-normalization ceiling applied in postprocess() to avoid clipping.
max_val = 0.8
# Sample rate (Hz) of the synthesized output written by torchaudio.save.
target_sr = 22050
# Sample rate (Hz) the zero-shot prompt audio is loaded at via load_wav.
prompt_sr = 16000
|
|
|
def download_with_retry(repo_id, max_retries=10, wait_sec=5):
    """Download a Hugging Face model snapshot, retrying on transient failures.

    Args:
        repo_id: Hugging Face repository id (e.g. "MediaTek-Research/BreezyVoice").
        max_retries: Maximum number of download attempts before giving up.
        wait_sec: Seconds to sleep between consecutive attempts.

    Returns:
        Local directory path of the downloaded snapshot.

    Raises:
        Exception: Re-raises the last error from ``snapshot_download`` once
            all attempts are exhausted.
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"正在下載模型(第 {attempt} 次)")
            model_dir = snapshot_download(repo_id)
            print("模型下載成功")
            return model_dir
        except Exception as e:
            print(f"第 {attempt} 次下載失敗:{e}")
            if attempt < max_retries:
                print(f"等待 {wait_sec} 秒後重試")
                time.sleep(wait_sec)
            else:
                print("已達最大重試次數")
                # Bare `raise` (instead of `raise e`) re-raises the active
                # exception with its original traceback intact.
                raise
|
|
|
# Fetch the BreezyVoice checkpoint (with retries) and build the TTS engine
# once at module load, so every Gradio request reuses the warm model.
model_dir = download_with_retry("MediaTek-Research/BreezyVoice")
cosyvoice = CosyVoice(model_dir)
|
|
|
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence, peak-normalize, and append 0.2 s of tail silence.

    Args:
        speech: Prompt waveform tensor, assumed shape (1, num_samples) —
            presumably what load_wav returns; verify against caller.
        top_db: Threshold in dB below peak for librosa's silence trimming.
        hop_length: Hop size in samples used by the trimmer.
        win_length: Analysis frame length in samples used by the trimmer.

    Returns:
        The processed waveform tensor with a silent tail of
        ``0.2 * target_sr`` samples appended.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    # Scale down only when the peak exceeds the ceiling; quiet audio is left as-is.
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)
|
|
|
def synthesize_speech(speaker_audio_path, content_text, speaker_text, speaker_key):
    """Run zero-shot TTS: clone the prompt voice and speak ``content_text``.

    Args:
        speaker_audio_path: File path of the prompt (reference) audio.
        content_text: Sentence to synthesize in the cloned voice.
        speaker_text: Transcription of the prompt audio; falls back to the
            preset's stored transcription when a preset speaker is selected
            and the textbox was left empty.
        speaker_key: Selected preset label, or ""/None for a custom upload.

    Returns:
        Path of the rendered WAV file ("results/output.wav").

    Raises:
        gr.Error: If the prompt audio, its transcription, or the content
            text is missing or too short.
    """
    # Both the preset and custom paths use the audio widget's file directly;
    # the original duplicated this assignment in each branch.
    prompt_wav_path = speaker_audio_path

    # Fall back to the preset transcription when the box is empty. Guard
    # against None: Gradio can deliver None for an untouched textbox, and
    # the original `speaker_text.strip()` would raise AttributeError.
    if speaker_key and speaker_key in preset_speakers:
        if not (speaker_text or "").strip():
            speaker_text = preset_speakers[speaker_key]["transcription"]

    if not prompt_wav_path:
        raise gr.Error("請上傳或選擇語音樣本")
    if not speaker_text or len(speaker_text.strip()) < 5:
        raise gr.Error("語音樣本的轉寫內容太短,至少輸入5字以上")
    # New guard: an empty synthesis sentence would otherwise be passed to
    # the model and fail downstream with an opaque error.
    if not content_text or not content_text.strip():
        raise gr.Error("請輸入要合成的文字句子")

    prompt_speech_16k = postprocess(load_wav(prompt_wav_path, prompt_sr))
    output = cosyvoice.inference_zero_shot(content_text, speaker_text, prompt_speech_16k)

    # Flatten to 1-D, then restore the (1, num_samples) layout torchaudio expects.
    audio_data = output['tts_speech'].numpy().flatten()

    os.makedirs("results", exist_ok=True)
    out_path = "results/output.wav"
    torchaudio.save(out_path, torch.tensor(audio_data).unsqueeze(0), sample_rate=target_sr)
    return out_path
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Layout: usage notes, preset-speaker dropdown, prompt audio +
# text inputs, output player, and the synthesize button wired to the
# handlers defined above.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("""
# 小睿語音合成
選擇預設語音 (6歲、8歲) 或上傳5~15秒語音樣本及該則語音樣本的轉寫,並輸入要合成的句子。
視該時段運算資源和語句長度而定約需70至2000秒,若等候時間過長,可改用較快但較不穩的版本:
https://colab.research.google.com/drive/15gfHseSyHhsQi8FMviwptJ95QllMcMOe?usp=sharing
""")

    # Optional preset voice; picking one auto-fills the two widgets below.
    speaker_selector = gr.Dropdown(
        label="選擇語音樣本(可選)",
        choices=["", *preset_speakers.keys()],
        value="",
        interactive=True
    )

    with gr.Row():
        # Prompt sample (uploaded or filled in by the preset dropdown).
        audio_input = gr.Audio(label="上傳或套用語音樣本", type="filepath", interactive=True)
        # Sentence to synthesize in the cloned voice.
        content_input = gr.Textbox(label="要合成的文字句子", placeholder="例如:今天天氣真好")
        # Transcription of the prompt sample (required by zero-shot inference).
        transcription_input = gr.Textbox(
            label="語音樣本的轉寫(必填)",
            placeholder="例如:只是呢你們看,這一輛微笑號它這裡有寫八百型的喔...",
            lines=3
        )

    output_audio = gr.Audio(label="合成結果", type="filepath")
    btn = gr.Button("開始語音合成")

    # Selecting a preset downloads its sample and fills both input widgets.
    speaker_selector.change(
        fn=apply_preset,
        inputs=speaker_selector,
        outputs=[audio_input, transcription_input]
    )

    # Main action: run zero-shot synthesis and surface the resulting WAV.
    btn.click(
        fn=synthesize_speech,
        inputs=[audio_input, content_input, transcription_input, speaker_selector],
        outputs=output_audio
    )

    gr.Markdown("""
為了加速,已關閉自動語音辨識,務必正確輸入語音樣本的文字轉寫。
""")
|
|
|
# Start the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()