import os
import sys
import gradio as gr
import numpy as np
import torch
import librosa
import torchaudio
import time
import requests
import traceback
from huggingface_hub import snapshot_download
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
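# Gradio demo for zero-shot voice cloning: a short reference clip plus its
# exact transcription is used to prompt a BreezyVoice (CosyVoice-based) TTS
# model via inference_zero_shot().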
APP_TITLE = "思語驛站 (Remembrance Station)"
CHILD_NAME = "小睿"
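# Preset prompt clips hosted in a Hugging Face dataset. Each entry pairs a
# recording with its verbatim Chinese transcription; the model needs the
# transcription to match the audio word for word, so these strings are kept
# in the original language.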
preset_speakers = {
f"{CHILD_NAME}(6歲)": {
"url": "https://huggingface.co/datasets/kanahomaisa/Remembrance_Station_Samples/resolve/main/smile_train.wav",
"transcription": "名字是微笑號,只是呢你們看,這一輛微笑號它這裡有寫八百型的喔,它是八百山出來"
},
f"{CHILD_NAME}(8歲)": {
"url": "https://huggingface.co/datasets/kanahomaisa/Remembrance_Station_Samples/resolve/main/utensils.wav",
"transcription": "出來了出來了,你知道這個餐具是可以挖的,這個餐具可以用窩課魅一起挖嗎?"
}
}
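# Fetch a preset prompt clip over HTTP. Note: the fixed default save_path
# means concurrent users can overwrite each other's temp file, which is
# tolerable for a single-user demo but worth knowing about.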
def download_audio_from_hf(url, save_path="temp_prompt.wav"):
try:
response = requests.get(url, timeout=15)
response.raise_for_status()
with open(save_path, "wb") as f:
f.write(response.content)
return save_path
except requests.exceptions.RequestException as e:
print(f"下載音檔失敗: {url}, 錯誤: {e}")
raise gr.Error(f"無法載入預設聲音片段({e})")
def apply_preset(speaker_key):
if speaker_key and speaker_key in preset_speakers:
try:
url = preset_speakers[speaker_key]["url"]
transcription = preset_speakers[speaker_key]["transcription"]
local_path = download_audio_from_hf(url)
return local_path, transcription
except gr.Error as e:
print(f"無法載入預設聲音: {e}")
return None, ""
except Exception as e:
print(f"套用預設時發生錯誤: {e}")
print(f"處理預設聲音時發生錯誤,請稍後再試")
return None, ""
return None, ""
max_val = 0.8
target_sr = 22050
prompt_sr = 16000
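# snapshot_download can fail transiently on Spaces (network hiccups, rate
# limits), so wrap it in a simple fixed-interval retry loop.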
def download_with_retry(repo_id, max_retries=5, wait_sec=5):
for attempt in range(1, max_retries + 1):
try:
print(f"正在下載模型(嘗試 {attempt}/{max_retries})")
model_dir = snapshot_download(repo_id, cache_dir="./hf_cache")
print("準備完成")
return model_dir
except Exception as e:
print(f"第 {attempt} 次準備失敗:{e}")
if attempt < max_retries:
print(f"等待 {wait_sec} 秒後重試...")
time.sleep(wait_sec)
else:
print("已達最大重試次數")
raise ConnectionError(f"無法下載模型 {repo_id}")
try:
model_dir = download_with_retry("MediaTek-Research/BreezyVoice")
cosyvoice = CosyVoice(model_dir)
print("準備完成")
except ConnectionError as e:
print(f"初始化錯誤: {e}")
cosyvoice = None
except Exception as e:
print(f"初始化過程中發生未知錯誤: {e}")
import traceback
traceback.print_exc()
cosyvoice = None
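# Trim leading/trailing silence, peak-normalize to max_val, and append a
# short silence pad so the prompt does not end abruptly. Near-silent input
# is passed through untrimmed, since librosa could otherwise trim away the
# whole signal.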
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
try:
if isinstance(speech, np.ndarray):
speech = torch.from_numpy(speech)
if speech.ndim == 1:
speech = speech.unsqueeze(0)
speech_np = speech.squeeze().cpu().numpy()
if np.max(np.abs(speech_np)) < 1e-5:
print("警告: 輸入音頻近乎靜音,跳過修剪")
trimmed_speech_np = speech_np
else:
trimmed_speech_np, _ = librosa.effects.trim(
speech_np, top_db=top_db,
frame_length=win_length,
hop_length=hop_length
)
trimmed_speech = torch.from_numpy(trimmed_speech_np).unsqueeze(0)
if trimmed_speech.numel() > 0 and trimmed_speech.abs().max() > 1e-5:
abs_max = trimmed_speech.abs().max()
if abs_max > max_val:
trimmed_speech = trimmed_speech / abs_max * max_val
else:
print("警告: 修剪後的音頻近乎靜音或為空,跳過標準化")
silence = torch.zeros(1, int(target_sr * 0.2), dtype=trimmed_speech.dtype)
speech_final = torch.concat([trimmed_speech.to(silence.device), silence], dim=1)
return speech_final
except Exception as e:
print(f"音訊後處理錯誤: {e}")
import traceback
traceback.print_exc()
print("警告: 後處理失敗,可能影響輸出音質。返回原始處理前音頻")
if speech.ndim == 1:
return speech.unsqueeze(0)
return speech
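# Main click handler: pick the prompt source (a selected preset wins over an
# upload), validate the transcription and target text, resample the prompt
# to 16 kHz mono, run zero-shot inference, and save the result to results/.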
def synthesize_speech(speaker_audio_path, content_text, speaker_text, speaker_key):
if cosyvoice is None:
raise gr.Error("啟動失敗,請重啟")
start_time = time.time()
print("開始準備聲音片段")
prompt_wav_path = None
transcription = ""
if speaker_key and speaker_key in preset_speakers:
print(f"使用預設樣本: {speaker_key}")
try:
preset_url = preset_speakers[speaker_key]["url"]
preset_transcription = preset_speakers[speaker_key]["transcription"]
prompt_wav_path = "temp_prompt.wav"
if not os.path.exists(prompt_wav_path) or speaker_key != getattr(synthesize_speech, '_last_preset_key', None):
print(f"下載或更新預設樣本: {speaker_key}")
prompt_wav_path = download_audio_from_hf(preset_url, prompt_wav_path)
synthesize_speech._last_preset_key = speaker_key
else:
print("使用已下載的預設樣本")
transcription = preset_transcription if not speaker_text.strip() else speaker_text.strip()
except Exception as e:
raise gr.Error(f"處理預設樣本時出錯: {e}")
elif speaker_audio_path:
print(f"使用上傳樣本: {speaker_audio_path}")
prompt_wav_path = speaker_audio_path
transcription = (speaker_text or "").strip()
else:
raise gr.Error("請選擇一個預設聲音,或上傳聲音片段")
if not prompt_wav_path or not os.path.exists(prompt_wav_path):
raise gr.Error(f"聲音片段未能載入 ({prompt_wav_path}),請重試。")
if not transcription or len(transcription) < 5:
raise gr.Error("聲音片段的文字記錄太短或未提供,請輸入至少5個字。準確的記錄有助於生成更自然的聲音")
if not content_text or not content_text.strip():
raise gr.Error("請輸入想聽到的句子")
print(f"聲音樣本路徑: {prompt_wav_path}")
print(f"樣本文字記錄: {transcription}")
print(f"目標合成文字: {content_text}")
try:
print("正在分析聲音")
prompt_speech_orig, sr = torchaudio.load(prompt_wav_path)
if sr != prompt_sr:
print(f"轉換提示音頻採樣率: {sr} -> {prompt_sr}")
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=prompt_sr)
prompt_speech_16k_tensor = resampler(prompt_speech_orig)
else:
prompt_speech_16k_tensor = prompt_speech_orig
if prompt_speech_16k_tensor.shape[0] > 1:
prompt_speech_16k_tensor = torch.mean(prompt_speech_16k_tensor, dim=0, keepdim=True)
prompt_speech_processed = postprocess(prompt_speech_16k_tensor)
if prompt_speech_processed.numel() == 0:
raise gr.Error("處理後的聲音片段為空,請檢查原始音檔")
print("正在生成...")
output = cosyvoice.inference_zero_shot(content_text, transcription, prompt_speech_processed)
print("推理完成")
if 'tts_speech' not in output or output['tts_speech'] is None:
raise gr.Error("請嘗試調整輸入文本或聲音片段")
audio_data_tensor = output['tts_speech']
if isinstance(audio_data_tensor, np.ndarray):
audio_data_tensor = torch.from_numpy(audio_data_tensor)
if audio_data_tensor.ndim == 1:
audio_data_tensor = audio_data_tensor.unsqueeze(0)
if audio_data_tensor.numel() == 0:
raise gr.Error("生成結果為空")
print("即將完成")
os.makedirs("results", exist_ok=True)
timestamp = int(time.time())
out_path = f"results/output_{timestamp}.wav"
torchaudio.save(out_path, audio_data_tensor.cpu(), sample_rate=target_sr)
print(f"合成結果已保存至: {out_path}")
end_time = time.time()
print(f"合成耗時: {end_time - start_time:.2f} 秒")
return out_path
except FileNotFoundError:
raise gr.Error(f"找不到聲音片段檔案:{prompt_wav_path},請重新上傳或選擇")
except Exception as e:
print(f"語音合成過程中發生錯誤: {e}")
import traceback
traceback.print_exc()
error_message = f" ({e}) 請檢查輸入內容或稍後再試一次"
if "CUDA" in str(e) and "out of memory" in str(e):
error_message = "請嘗試合成較短的句子或稍後再試"
elif "break model index not valid" in str(e) or "load tokenizer failed" in str(e):
error_message = "錯誤"
raise gr.Error(error_message)
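# Gradio UI: preset dropdown and upload/record controls on the left,
# target text and synthesized audio on the right.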
with gr.Blocks(title=APP_TITLE) as demo:
gr.Markdown(f"""
# 思語驛站Remembrance Station
透過先進的聲音技術,盡可能重現熟悉的**{CHILD_NAME}**聲音
**用法:**
1. **選擇時光膠囊 (可選):** 從下拉選單中選擇一個預設的聲音片段 (來自**{CHILD_NAME}**不同時期的聲音)
2. **或自己上傳語音:** 上傳一段**{CHILD_NAME}**的清晰錄音 (建議 5-15 秒,雜音越少越好,若太短生成時會有雜音),欲上傳或使用錄製音訊時,要先在聲音片段的下拉選單選空白選項
3. **記錄當時的話語 (必填):** 在下方文字框中,**準確輸入**該段錄音中的**每一句話包含標點符號**
4. **想聽{CHILD_NAME}說的話:** 輸入希望透過**{CHILD_NAME}**的聲音說出的句子
5. **點擊「開始」:** 耐心等待聲音合成,耗時視句子長度和該時段運算資源而定需60~2000秒(例:16字,晚上11點,花400秒)
""")
with gr.Row():
with gr.Column(scale=1):
speaker_selector = gr.Dropdown(
label="選擇時光膠囊 (聲音片段)",
choices=[""] + list(preset_speakers.keys()),
value="",
interactive=True
)
audio_input = gr.Audio(
label=f"上傳/錄製{CHILD_NAME}的聲音片段",
sources=["upload", "microphone"],
type="filepath",
interactive=True
)
transcription_input = gr.Textbox(
label="聲音片段的文字記錄 (必填)",
placeholder=f"一字不差地輸入上方聲音片段裡{CHILD_NAME}說的每一句話包含標點符號",
lines=3,
interactive=True
)
with gr.Column(scale=2):
content_input = gr.Textbox(
label=f"想聽{CHILD_NAME}說的話",
placeholder="例如:天氣真好",
lines=5,
interactive=True
)
output_audio = gr.Audio(
label="結果",
type="filepath",
interactive=False
)
btn = gr.Button("開始")
speaker_selector.change(
fn=apply_preset,
inputs=speaker_selector,
outputs=[audio_input, transcription_input]
)
btn.click(
fn=synthesize_speech,
inputs=[audio_input, content_input, transcription_input, speaker_selector],
outputs=output_audio
)
gr.Markdown(f"""
---
* 為了讓**{CHILD_NAME}**的聲音更自然,提供的「聲音片段」和「文字記錄」需要非常匹配。聲音片段建議 5~15 秒,背景噪音低的清晰人聲,標點符號也很重要
* 欲上傳或使用錄製音訊時,要先在聲音片段的下拉選單選空白選項
* 多人使用時要排隊,如果等待時間過長,或效果不如預期,可以嘗試使用較短的句子,或換一個聲音片段試試看
* 技術仍在發展中,耗時視句子長度和該時段運算資源而定需200~2000秒,而且雖然模仿聲音,但無法複製獨一無二的人
* 如果遇到困難,或想嘗試運算更快的版本https://colab.research.google.com/drive/11LmxpL4IsQ8rwjki7Qc_84qqYkGskTZI?usp=sharing
""")
if __name__ == "__main__":
if cosyvoice is None:
print("錯誤:合成引擎仔入失敗")
with gr.Blocks() as demo_error:
gr.Markdown("啟動失敗,重開試試")
demo_error.launch()
else:
demo.launch()