# 思語驛站 (Remembrance Station) — Gradio voice-cloning TTS app built on BreezyVoice/CosyVoice.
import os | |
import sys | |
import gradio as gr | |
import numpy as np | |
import torch | |
import librosa | |
import torchaudio | |
import time | |
import requests | |
from huggingface_hub import snapshot_download | |
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) | |
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS') | |
from cosyvoice.cli.cosyvoice import CosyVoice | |
from cosyvoice.utils.file_utils import load_wav | |
# User-facing application title (shown as the page title of the Gradio app).
APP_TITLE = "思語驛站 (Remembrance Station)"
# Display name of the child whose voice the app recreates.
CHILD_NAME = "小睿"
# Preset "time capsule" voice samples hosted on the Hugging Face Hub.
# Each entry maps a dropdown label to the sample's direct download URL and
# the exact transcription of that recording — zero-shot cloning needs a
# faithful transcript of the prompt audio.
preset_speakers = {
    f"{CHILD_NAME}(6歲)": {
        "url": "https://huggingface.co/datasets/kanahomaisa/Remembrance_Station_Samples/resolve/main/smile_train.wav",
        "transcription": "名字是微笑號,只是呢你們看,這一輛微笑號它這裡有寫八百型的喔,它是八百山出來"
    },
    f"{CHILD_NAME}(8歲)": {
        "url": "https://huggingface.co/datasets/kanahomaisa/Remembrance_Station_Samples/resolve/main/utensils.wav",
        "transcription": "出來了出來了,你知道這個餐具是可以挖的,這個餐具可以用窩課魅一起挖嗎?"
    }
}
def download_audio_from_hf(url, save_path="temp_prompt.wav"):
    """Download a prompt audio file and save it locally.

    Args:
        url: Direct download URL of the audio sample.
        save_path: Local path the file is written to.

    Returns:
        The local path the audio was saved to (`save_path`).

    Raises:
        gr.Error: If the request fails for any network-related reason.
    """
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    except requests.exceptions.RequestException as e:
        print(f"下載音檔失敗: {url}, 錯誤: {e}")
        # Fix: chain the original exception so the network cause is not lost.
        raise gr.Error(f"無法載入預設聲音片段({e})") from e
def apply_preset(speaker_key):
    """Resolve a preset dropdown selection into (local audio path, transcript).

    Returns (None, "") when nothing valid is selected or loading fails.
    """
    # Guard clause: no selection, or a key we don't know about.
    if not speaker_key or speaker_key not in preset_speakers:
        return None, ""
    entry = preset_speakers[speaker_key]
    try:
        return download_audio_from_hf(entry["url"]), entry["transcription"]
    except gr.Error as err:
        print(f"無法載入預設聲音: {err}")
    except Exception as err:
        print(f"套用預設時發生錯誤: {err}")
        print("處理預設聲音時發生錯誤,請稍後再試")
    return None, ""
# Peak amplitude ceiling used when normalising the trimmed prompt audio.
max_val = 0.8
# Sample rate (Hz) of the synthesised output written to disk.
target_sr = 22050
# Sample rate (Hz) the voice prompt is resampled to before analysis.
prompt_sr = 16000
def download_with_retry(repo_id, max_retries=5, wait_sec=5):
    """Download a model snapshot from the Hugging Face Hub, retrying on failure.

    Args:
        repo_id: Hub repository id of the model to fetch.
        max_retries: Maximum number of download attempts.
        wait_sec: Seconds to sleep between attempts.

    Returns:
        Local directory of the downloaded snapshot.

    Raises:
        ConnectionError: If every attempt fails (chains the last error).
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"正在下載模型(嘗試 {attempt}/{max_retries})")
            model_dir = snapshot_download(repo_id, cache_dir="./hf_cache")
            print("準備完成")
            return model_dir
        except Exception as e:
            print(f"第 {attempt} 次準備失敗:{e}")
            if attempt < max_retries:
                print(f"等待 {wait_sec} 秒後重試...")
                time.sleep(wait_sec)
            else:
                print("已達最大重試次數")
                # Fix: preserve the underlying cause in the raised exception.
                raise ConnectionError(f"無法下載模型 {repo_id}") from e
# Initialise the synthesis engine at import time. Any failure leaves
# `cosyvoice` as None so the UI can show a friendly error page instead of
# crashing on startup (see the __main__ guard at the bottom of the file).
try:
    model_dir = download_with_retry("MediaTek-Research/BreezyVoice")
    cosyvoice = CosyVoice(model_dir)
    print("準備完成")
except ConnectionError as e:
    # Expected failure mode: all download retries exhausted.
    print(f"初始化錯誤: {e}")
    cosyvoice = None
except Exception as e:
    # Anything else (bad checkpoint, CUDA issues, ...) — log the traceback.
    print(f"初始化過程中發生未知錯誤: {e}")
    import traceback
    traceback.print_exc()
    cosyvoice = None
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence from a voice prompt, normalise its peak and pad it.

    Args:
        speech: Waveform as a torch.Tensor or numpy array, 1-D or
            (channels, samples).
        top_db: Silence threshold (dB) passed to ``librosa.effects.trim``.
        hop_length: Hop length (samples) for the trim analysis.
        win_length: Frame length (samples) for the trim analysis.

    Returns:
        torch.Tensor of shape (1, samples) with 0.2 s of silence appended
        (silence length computed from the module-level `target_sr`).
        On any internal failure the input is returned as a (1, n) tensor.
    """
    try:
        if isinstance(speech, np.ndarray):
            speech = torch.from_numpy(speech)
        if speech.ndim == 1:
            speech = speech.unsqueeze(0)
        speech_np = speech.squeeze().cpu().numpy()
        if np.max(np.abs(speech_np)) < 1e-5:
            # Near-silent input: trimming would remove everything.
            print("警告: 輸入音頻近乎靜音,跳過修剪")
            trimmed_speech_np = speech_np
        else:
            trimmed_speech_np, _ = librosa.effects.trim(
                speech_np, top_db=top_db,
                frame_length=win_length,
                hop_length=hop_length
            )
        trimmed_speech = torch.from_numpy(trimmed_speech_np).unsqueeze(0)
        if trimmed_speech.numel() > 0 and trimmed_speech.abs().max() > 1e-5:
            # Scale the peak down to `max_val` so loud prompts do not clip.
            abs_max = trimmed_speech.abs().max()
            if abs_max > max_val:
                trimmed_speech = trimmed_speech / abs_max * max_val
        else:
            print("警告: 修剪後的音頻近乎靜音或為空,跳過標準化")
        # Append 0.2 s of silence so synthesis does not cut off abruptly.
        silence = torch.zeros(1, int(target_sr * 0.2), dtype=trimmed_speech.dtype)
        speech_final = torch.concat([trimmed_speech.to(silence.device), silence], dim=1)
        return speech_final
    except Exception as e:
        print(f"音訊後處理錯誤: {e}")
        import traceback
        traceback.print_exc()
        print("警告: 後處理失敗,可能影響輸出音質。返回原始處理前音頻")
        # Fix: the fallback must also cope with numpy input — the original
        # called .unsqueeze() on `speech` unconditionally, which crashes if
        # the numpy->tensor conversion above was what failed.
        if isinstance(speech, np.ndarray):
            speech = torch.as_tensor(np.ascontiguousarray(speech))
        if speech.ndim == 1:
            return speech.unsqueeze(0)
        return speech
def synthesize_speech(speaker_audio_path, content_text, speaker_text, speaker_key):
    """Synthesise `content_text` in the voice captured by the prompt sample.

    Args:
        speaker_audio_path: Path of an uploaded/recorded prompt recording
            (used when no preset is selected).
        content_text: Text to synthesise.
        speaker_text: Transcription of the prompt recording (may be None).
        speaker_key: Key into `preset_speakers`; takes precedence over an
            uploaded file when set.

    Returns:
        Path of the generated wav file under results/.

    Raises:
        gr.Error: On invalid input or any synthesis failure.
    """
    if cosyvoice is None:
        raise gr.Error("啟動失敗,請重啟")
    start_time = time.time()
    print("開始準備聲音片段")
    # Fix: Gradio can pass None instead of an empty string for a cleared
    # textbox; normalise once so .strip() cannot raise AttributeError.
    speaker_text = (speaker_text or "").strip()
    prompt_wav_path = None
    transcription = ""
    if speaker_key and speaker_key in preset_speakers:
        print(f"使用預設樣本: {speaker_key}")
        try:
            preset_url = preset_speakers[speaker_key]["url"]
            preset_transcription = preset_speakers[speaker_key]["transcription"]
            prompt_wav_path = "temp_prompt.wav"
            # Re-download only when the cached file is missing or belongs to
            # a different preset (last key is tracked on the function object).
            if not os.path.exists(prompt_wav_path) or speaker_key != getattr(synthesize_speech, '_last_preset_key', None):
                print(f"下載或更新預設樣本: {speaker_key}")
                prompt_wav_path = download_audio_from_hf(preset_url, prompt_wav_path)
                synthesize_speech._last_preset_key = speaker_key
            else:
                print("使用已下載的預設樣本")
            # A user-supplied transcription overrides the preset one.
            transcription = speaker_text if speaker_text else preset_transcription
        except Exception as e:
            raise gr.Error(f"處理預設樣本時出錯: {e}")
    elif speaker_audio_path:
        print(f"使用上傳樣本: {speaker_audio_path}")
        prompt_wav_path = speaker_audio_path
        transcription = speaker_text
    else:
        raise gr.Error("請選擇一個預設聲音,或上傳聲音片段")
    # Validate everything before touching the model.
    if not prompt_wav_path or not os.path.exists(prompt_wav_path):
        raise gr.Error(f"聲音片段未能載入 ({prompt_wav_path}),請重試。")
    if not transcription or len(transcription) < 5:
        raise gr.Error("聲音片段的文字記錄太短或未提供,請輸入至少5個字。準確的記錄有助於生成更自然的聲音")
    if not content_text or not content_text.strip():
        raise gr.Error("請輸入想聽到的句子")
    print(f"聲音樣本路徑: {prompt_wav_path}")
    print(f"樣本文字記錄: {transcription}")
    print(f"目標合成文字: {content_text}")
    try:
        print("正在分析聲音")
        prompt_speech_orig, sr = torchaudio.load(prompt_wav_path)
        # Resample the prompt to the rate the model expects (prompt_sr).
        if sr != prompt_sr:
            print(f"轉換提示音頻採樣率: {sr} -> {prompt_sr}")
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=prompt_sr)
            prompt_speech_16k_tensor = resampler(prompt_speech_orig)
        else:
            prompt_speech_16k_tensor = prompt_speech_orig
        # Downmix multi-channel audio to mono.
        if prompt_speech_16k_tensor.shape[0] > 1:
            prompt_speech_16k_tensor = torch.mean(prompt_speech_16k_tensor, dim=0, keepdim=True)
        prompt_speech_processed = postprocess(prompt_speech_16k_tensor)
        if prompt_speech_processed.numel() == 0:
            raise gr.Error("處理後的聲音片段為空,請檢查原始音檔")
        print("正在生成...")
        output = cosyvoice.inference_zero_shot(content_text, transcription, prompt_speech_processed)
        print("推理完成")
        if 'tts_speech' not in output or output['tts_speech'] is None:
            raise gr.Error("請嘗試調整輸入文本或聲音片段")
        audio_data_tensor = output['tts_speech']
        if isinstance(audio_data_tensor, np.ndarray):
            audio_data_tensor = torch.from_numpy(audio_data_tensor)
        if audio_data_tensor.ndim == 1:
            audio_data_tensor = audio_data_tensor.unsqueeze(0)
        if audio_data_tensor.numel() == 0:
            raise gr.Error("生成結果為空")
        print("即將完成")
        os.makedirs("results", exist_ok=True)
        timestamp = int(time.time())
        out_path = f"results/output_{timestamp}.wav"
        torchaudio.save(out_path, audio_data_tensor.cpu(), sample_rate=target_sr)
        print(f"合成結果已保存至: {out_path}")
        end_time = time.time()
        print(f"合成耗時: {end_time - start_time:.2f} 秒")
        return out_path
    except FileNotFoundError:
        raise gr.Error(f"找不到聲音片段檔案:{prompt_wav_path},請重新上傳或選擇")
    except gr.Error:
        # Fix: gr.Error subclasses Exception, so the specific messages we
        # raise above were being swallowed and re-wrapped by the generic
        # handler below. Re-raise them untouched.
        raise
    except Exception as e:
        print(f"語音合成過程中發生錯誤: {e}")
        import traceback
        traceback.print_exc()
        error_message = f" ({e}) 請檢查輸入內容或稍後再試一次"
        if "CUDA" in str(e) and "out of memory" in str(e):
            error_message = "請嘗試合成較短的句子或稍後再試"
        elif "break model index not valid" in str(e) or "load tokenizer failed" in str(e):
            error_message = "錯誤"
        raise gr.Error(error_message)
# ---------------------------------------------------------------------------
# Gradio UI layout. Left column: preset selector, prompt upload/recording and
# its transcription; right column: target text, result player and the button.
# ---------------------------------------------------------------------------
with gr.Blocks(title=APP_TITLE) as demo:
    # Header with usage instructions.
    gr.Markdown(f"""
    # 思語驛站Remembrance Station
    透過先進的聲音技術,盡可能重現熟悉的**{CHILD_NAME}**聲音
    **用法:**
    1. **選擇時光膠囊 (可選):** 從下拉選單中選擇一個預設的聲音片段 (來自**{CHILD_NAME}**不同時期的聲音)
    2. **或自己上傳語音:** 上傳一段**{CHILD_NAME}**的清晰錄音 (建議 5-15 秒,雜音越少越好,若太短生成時會有雜音),欲上傳或使用錄製音訊時,要先在聲音片段的下拉選單選空白選項
    3. **記錄當時的話語 (必填):** 在下方文字框中,**準確輸入**該段錄音中的**每一句話包含標點符號**
    4. **想聽{CHILD_NAME}說的話:** 輸入希望透過**{CHILD_NAME}**的聲音說出的句子
    5. **點擊「開始」:** 耐心等待聲音合成,耗時視句子長度和該時段運算資源而定需60~2000秒(例:16字,晚上11點,花400秒)
    """)
    with gr.Row():
        with gr.Column(scale=1):
            # Preset "time capsule" selector; empty string means "no preset".
            speaker_selector = gr.Dropdown(
                label="選擇時光膠囊 (聲音片段)",
                choices=[""] + list(preset_speakers.keys()),
                value="",
                interactive=True
            )
            # Prompt audio — filepath type so synthesize_speech gets a path.
            audio_input = gr.Audio(
                label=f"上傳/錄製{CHILD_NAME}的聲音片段",
                sources=["upload", "microphone"],
                type="filepath",
                interactive=True
            )
            # Required transcription of the prompt audio.
            transcription_input = gr.Textbox(
                label="聲音片段的文字記錄 (必填)",
                placeholder=f"一字不差地輸入上方聲音片段裡{CHILD_NAME}說的每一句話包含標點符號",
                lines=3,
                interactive=True
            )
        with gr.Column(scale=2):
            # Target sentence to synthesise.
            content_input = gr.Textbox(
                label=f"想聽{CHILD_NAME}說的話",
                placeholder="例如:天氣真好",
                lines=5,
                interactive=True
            )
            # Read-only player for the generated audio.
            output_audio = gr.Audio(
                label="結果",
                type="filepath",
                interactive=False
            )
            btn = gr.Button("開始")
    # Selecting a preset fills in the audio and transcription fields.
    speaker_selector.change(
        fn=apply_preset,
        inputs=speaker_selector,
        outputs=[audio_input, transcription_input]
    )
    # Main action: run synthesis and show the result.
    btn.click(
        fn=synthesize_speech,
        inputs=[audio_input, content_input, transcription_input, speaker_selector],
        outputs=output_audio
    )
    # Footer with tips and a link to a faster Colab variant.
    gr.Markdown(f"""
    ---
    * 為了讓**{CHILD_NAME}**的聲音更自然,提供的「聲音片段」和「文字記錄」需要非常匹配。聲音片段建議 5~15 秒,背景噪音低的清晰人聲,標點符號也很重要
    * 欲上傳或使用錄製音訊時,要先在聲音片段的下拉選單選空白選項
    * 多人使用時要排隊,如果等待時間過長,或效果不如預期,可以嘗試使用較短的句子,或換一個聲音片段試試看
    * 技術仍在發展中,耗時視句子長度和該時段運算資源而定需200~2000秒,而且雖然模仿聲音,但無法複製獨一無二的人
    * 如果遇到困難,或想嘗試運算更快的版本https://colab.research.google.com/drive/11LmxpL4IsQ8rwjki7Qc_84qqYkGskTZI?usp=sharing
    """)
if __name__ == "__main__":
    # If engine initialisation failed at import time, launch a minimal
    # error page instead of the real app.
    if cosyvoice is None:
        # Fix: corrected typo "仔入" -> "載入" in the log message.
        print("錯誤:合成引擎載入失敗")
        with gr.Blocks() as demo_error:
            gr.Markdown("啟動失敗,重開試試")
        demo_error.launch()
    else:
        demo.launch()