# app.py — BreezyVoice Gradio demo (kanahomaisa, commit f3ce48a)
import os
import sys
import gradio as gr
import numpy as np
import torch
import librosa
import torchaudio
from scipy.signal import resample
import time
import requests
from huggingface_hub import snapshot_download
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
preset_speakers = {
"6歲": {
"url": "https://huggingface.co/datasets/kanahomaisa/breezyvoice-samples/resolve/main/smile_train.wav",
"transcription": "名字是微笑號,只是呢你們看,這一輛微笑號它這裡有寫八百型的喔,它是八百山出來"
},
"8歲": {
"url": "https://huggingface.co/datasets/kanahomaisa/breezyvoice-samples/resolve/main/utensils.wav",
"transcription": "出來了出來了,你知道這個餐具是可以挖的,這個餐具可以用窩課魅一起挖嗎?"
}
}
def download_audio_from_hf(url, save_path="temp_prompt.wav"):
response = requests.get(url)
response.raise_for_status()
with open(save_path, "wb") as f:
f.write(response.content)
return save_path
def apply_preset(speaker_key):
if speaker_key in preset_speakers:
url = preset_speakers[speaker_key]["url"]
transcription = preset_speakers[speaker_key]["transcription"]
local_path = download_audio_from_hf(url)
return local_path, transcription
return None, ""
max_val = 0.8
target_sr = 22050
prompt_sr = 16000
def download_with_retry(repo_id, max_retries=10, wait_sec=5):
for attempt in range(1, max_retries + 1):
try:
print(f"正在下載模型(第 {attempt} 次)")
model_dir = snapshot_download(repo_id)
print("模型下載成功")
return model_dir
except Exception as e:
print(f"第 {attempt} 次下載失敗:{e}")
if attempt < max_retries:
print(f"等待 {wait_sec} 秒後重試")
time.sleep(wait_sec)
else:
print("已達最大重試次數")
raise e
model_dir = download_with_retry("MediaTek-Research/BreezyVoice")
cosyvoice = CosyVoice(model_dir)
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
speech, _ = librosa.effects.trim(
speech, top_db=top_db,
frame_length=win_length,
hop_length=hop_length
)
if speech.abs().max() > max_val:
speech = speech / speech.abs().max() * max_val
speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
return speech
def synthesize_speech(speaker_audio_path, content_text, speaker_text, speaker_key):
if speaker_key and speaker_key in preset_speakers:
prompt_wav_path = speaker_audio_path
if not speaker_text.strip():
speaker_text = preset_speakers[speaker_key]["transcription"]
else:
prompt_wav_path = speaker_audio_path
if not prompt_wav_path:
raise gr.Error("請上傳或選擇語音樣本")
if not speaker_text or len(speaker_text.strip()) < 5:
raise gr.Error("語音樣本的轉寫內容太短,至少輸入5字以上")
prompt_speech_16k = postprocess(load_wav(prompt_wav_path, prompt_sr))
output = cosyvoice.inference_zero_shot(content_text, speaker_text, prompt_speech_16k)
audio_data = output['tts_speech'].numpy().flatten()
os.makedirs("results", exist_ok=True)
out_path = "results/output.wav"
torchaudio.save(out_path, torch.tensor(audio_data).unsqueeze(0), sample_rate=target_sr)
return out_path
with gr.Blocks() as demo:
gr.Markdown("""
# 小睿語音合成
選擇預設語音 (6歲、8歲) 或上傳5~15秒語音樣本及該則語音樣本的轉寫,並輸入要合成的句子。
視該時段運算資源和語句長度而定約需70至2000秒,若等候時間過長,可改用較快但較不穩的版本:
https://colab.research.google.com/drive/15gfHseSyHhsQi8FMviwptJ95QllMcMOe?usp=sharing
""")
speaker_selector = gr.Dropdown(
label="選擇語音樣本(可選)",
choices=["", *preset_speakers.keys()],
value="",
interactive=True
)
with gr.Row():
audio_input = gr.Audio(label="上傳或套用語音樣本", type="filepath", interactive=True)
content_input = gr.Textbox(label="要合成的文字句子", placeholder="例如:今天天氣真好")
transcription_input = gr.Textbox(
label="語音樣本的轉寫(必填)",
placeholder="例如:只是呢你們看,這一輛微笑號它這裡有寫八百型的喔...",
lines=3
)
output_audio = gr.Audio(label="合成結果", type="filepath")
btn = gr.Button("開始語音合成")
speaker_selector.change(
fn=apply_preset,
inputs=speaker_selector,
outputs=[audio_input, transcription_input]
)
btn.click(
fn=synthesize_speech,
inputs=[audio_input, content_input, transcription_input, speaker_selector],
outputs=output_audio
)
gr.Markdown("""
為了加速,已關閉自動語音辨識,務必正確輸入語音樣本的文字轉寫。
""")
if __name__ == "__main__":
demo.launch()