import os
import sys
import time

import gradio as gr
import librosa
import numpy as np
import requests
import torch
import torchaudio
from huggingface_hub import snapshot_download

# Make the vendored Matcha-TTS package importable before loading CosyVoice.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

APP_TITLE = "思語驛站 (Remembrance Station)"
CHILD_NAME = "小睿"

# Preset voice prompts: each entry pairs a reference recording with its
# exact transcription, which zero-shot cloning needs to match the audio.
preset_speakers = {
    f"{CHILD_NAME}(6歲)": {
        "url": "https://huggingface.co/datasets/kanahomaisa/Remembrance_Station_Samples/resolve/main/smile_train.wav",
        "transcription": "名字是微笑號,只是呢你們看,這一輛微笑號它這裡有寫八百型的喔,它是八百山出來"
    },
    f"{CHILD_NAME}(8歲)": {
        "url": "https://huggingface.co/datasets/kanahomaisa/Remembrance_Station_Samples/resolve/main/utensils.wav",
        "transcription": "出來了出來了,你知道這個餐具是可以挖的,這個餐具可以用窩課魅一起挖嗎?"
    }
}


def download_audio_from_hf(url, save_path="temp_prompt.wav"):
    """Download a preset prompt recording to a local file."""
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    except requests.exceptions.RequestException as e:
        print(f"下載音檔失敗: {url}, 錯誤: {e}")
        raise gr.Error(f"無法載入預設聲音片段({e})")


def apply_preset(speaker_key):
    """Resolve a dropdown selection into (local audio path, transcription)."""
    if speaker_key and speaker_key in preset_speakers:
        try:
            url = preset_speakers[speaker_key]["url"]
            transcription = preset_speakers[speaker_key]["transcription"]
            local_path = download_audio_from_hf(url)
            return local_path, transcription
        except gr.Error as e:
            print(f"無法載入預設聲音: {e}")
            return None, ""
        except Exception as e:
            print(f"套用預設時發生錯誤: {e}")
            print("處理預設聲音時發生錯誤,請稍後再試")
            return None, ""
    return None, ""


# Audio constants: peak ceiling for normalization, output sample rate,
# and the sample rate the model expects for the voice prompt.
max_val = 0.8
target_sr = 22050
prompt_sr = 16000


def download_with_retry(repo_id, max_retries=5, wait_sec=5):
    """Fetch the model snapshot from Hugging Face, retrying on failure."""
    for attempt in range(1, max_retries + 1):
        try:
            print(f"正在下載模型(嘗試 {attempt}/{max_retries})")
            model_dir = snapshot_download(repo_id, cache_dir="./hf_cache")
            print("準備完成")
            return model_dir
        except Exception as e:
            print(f"第 {attempt} 次準備失敗:{e}")
            if attempt < max_retries:
                print(f"等待 {wait_sec} 秒後重試...")
                time.sleep(wait_sec)
            else:
                print("已達最大重試次數")
    raise ConnectionError(f"無法下載模型 {repo_id}")


# Load the synthesis engine once at startup; if this fails, the UI falls
# back to an error page in the __main__ block below.
try:
    model_dir = download_with_retry("MediaTek-Research/BreezyVoice")
    cosyvoice = CosyVoice(model_dir)
    print("準備完成")
except ConnectionError as e:
    print(f"初始化錯誤: {e}")
    cosyvoice = None
except Exception as e:
    print(f"初始化過程中發生未知錯誤: {e}")
    import traceback
    traceback.print_exc()
    cosyvoice = None


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim leading/trailing silence, cap the peak at max_val, and append
    0.2 s of silence so synthesis does not cut off abruptly."""
    try:
        if isinstance(speech, np.ndarray):
            speech = torch.from_numpy(speech)
        if speech.ndim == 1:
            speech = speech.unsqueeze(0)
        speech_np = speech.squeeze().cpu().numpy()
        if np.max(np.abs(speech_np)) < 1e-5:
            print("警告: 輸入音頻近乎靜音,跳過修剪")
            trimmed_speech_np = speech_np
        else:
            trimmed_speech_np, _ = librosa.effects.trim(
                speech_np, top_db=top_db,
                frame_length=win_length, hop_length=hop_length
            )
        trimmed_speech = torch.from_numpy(trimmed_speech_np).unsqueeze(0)
        if trimmed_speech.numel() > 0 and trimmed_speech.abs().max() > 1e-5:
            abs_max = trimmed_speech.abs().max()
            if abs_max > max_val:
                trimmed_speech = trimmed_speech / abs_max * max_val
        else:
            print("警告: 修剪後的音頻近乎靜音或為空,跳過標準化")
        silence = torch.zeros(1, int(target_sr * 0.2), dtype=trimmed_speech.dtype)
        speech_final = torch.concat([trimmed_speech.to(silence.device), silence], dim=1)
        return speech_final
    except Exception as e:
        print(f"音訊後處理錯誤: {e}")
        import traceback
        traceback.print_exc()
        print("警告: 後處理失敗,可能影響輸出音質。返回原始處理前音頻")
        # Fall back to the unprocessed audio as a (1, n) tensor; the input
        # may still be a NumPy array if the failure happened early.
        if isinstance(speech, np.ndarray):
            speech = torch.from_numpy(speech)
        if speech.ndim == 1:
            return speech.unsqueeze(0)
        return speech
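
# ---------------------------------------------------------------------------
# Hedged sketch (not called anywhere in the app): a self-contained example of
# the trim -> peak-limit -> pad-silence pipeline that postprocess() applies to
# the prompt audio. The synthetic tone and the _demo_postprocess name are
# illustrative assumptions, not part of the original app.
# ---------------------------------------------------------------------------
def _demo_postprocess():
    # Build 0.5 s silence + 1 s of a 440 Hz tone + 0.5 s silence at prompt_sr.
    t = np.linspace(0, 1.0, prompt_sr, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)
    pad = np.zeros(prompt_sr // 2, dtype=np.float32)
    clip = np.concatenate([pad, tone, pad])
    processed = postprocess(torch.from_numpy(clip))
    # The surrounding silence is trimmed, the peak would be scaled down only
    # if it exceeded max_val (0.8), and 0.2 s of silence is appended.
    print(processed.shape, float(processed.abs().max()))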
def synthesize_speech(speaker_audio_path, content_text, speaker_text, speaker_key):
    """Clone the prompt voice and speak content_text with it."""
    if cosyvoice is None:
        raise gr.Error("啟動失敗,請重啟")
    start_time = time.time()
    print("開始準備聲音片段")
    # Normalize the transcript field defensively; Gradio may pass None.
    speaker_text = (speaker_text or "").strip()
    prompt_wav_path = None
    transcription = ""
    if speaker_key and speaker_key in preset_speakers:
        print(f"使用預設樣本: {speaker_key}")
        try:
            preset_url = preset_speakers[speaker_key]["url"]
            preset_transcription = preset_speakers[speaker_key]["transcription"]
            prompt_wav_path = "temp_prompt.wav"
            # Re-download only when the cached file is missing or belongs to
            # a different preset (tracked on the function object itself).
            if not os.path.exists(prompt_wav_path) or speaker_key != getattr(synthesize_speech, '_last_preset_key', None):
                print(f"下載或更新預設樣本: {speaker_key}")
                prompt_wav_path = download_audio_from_hf(preset_url, prompt_wav_path)
                synthesize_speech._last_preset_key = speaker_key
            else:
                print("使用已下載的預設樣本")
            # A user-edited transcript overrides the preset one.
            transcription = speaker_text if speaker_text else preset_transcription
        except Exception as e:
            raise gr.Error(f"處理預設樣本時出錯: {e}")
    elif speaker_audio_path:
        print(f"使用上傳樣本: {speaker_audio_path}")
        prompt_wav_path = speaker_audio_path
        transcription = speaker_text
    else:
        raise gr.Error("請選擇一個預設聲音,或上傳聲音片段")
    if not prompt_wav_path or not os.path.exists(prompt_wav_path):
        raise gr.Error(f"聲音片段未能載入 ({prompt_wav_path}),請重試。")
    if not transcription or len(transcription) < 5:
        raise gr.Error("聲音片段的文字記錄太短或未提供,請輸入至少5個字。準確的記錄有助於生成更自然的聲音")
    if not content_text or not content_text.strip():
        raise gr.Error("請輸入想聽到的句子")
    print(f"聲音樣本路徑: {prompt_wav_path}")
    print(f"樣本文字記錄: {transcription}")
    print(f"目標合成文字: {content_text}")
    try:
        print("正在分析聲音")
        prompt_speech_orig, sr = torchaudio.load(prompt_wav_path)
        # Resample the prompt to the 16 kHz the model expects.
        if sr != prompt_sr:
            print(f"轉換提示音頻採樣率: {sr} -> {prompt_sr}")
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=prompt_sr)
            prompt_speech_16k_tensor = resampler(prompt_speech_orig)
        else:
            prompt_speech_16k_tensor = prompt_speech_orig
        # Downmix multi-channel prompts to mono.
        if prompt_speech_16k_tensor.shape[0] > 1:
            prompt_speech_16k_tensor = torch.mean(prompt_speech_16k_tensor, dim=0, keepdim=True)
        prompt_speech_processed = postprocess(prompt_speech_16k_tensor)
        if prompt_speech_processed.numel() == 0:
            raise gr.Error("處理後的聲音片段為空,請檢查原始音檔")
        print("正在生成...")
        output = cosyvoice.inference_zero_shot(content_text, transcription, prompt_speech_processed)
        print("推理完成")
        if 'tts_speech' not in output or output['tts_speech'] is None:
            raise gr.Error("請嘗試調整輸入文本或聲音片段")
        audio_data_tensor = output['tts_speech']
        if isinstance(audio_data_tensor, np.ndarray):
            audio_data_tensor = torch.from_numpy(audio_data_tensor)
        if audio_data_tensor.ndim == 1:
            audio_data_tensor = audio_data_tensor.unsqueeze(0)
        if audio_data_tensor.numel() == 0:
            raise gr.Error("生成結果為空")
        print("即將完成")
        os.makedirs("results", exist_ok=True)
        timestamp = int(time.time())
        out_path = f"results/output_{timestamp}.wav"
        torchaudio.save(out_path, audio_data_tensor.cpu(), sample_rate=target_sr)
        print(f"合成結果已保存至: {out_path}")
        end_time = time.time()
        print(f"合成耗時: {end_time - start_time:.2f} 秒")
        return out_path
    except FileNotFoundError:
        raise gr.Error(f"找不到聲音片段檔案:{prompt_wav_path},請重新上傳或選擇")
    except Exception as e:
        print(f"語音合成過程中發生錯誤: {e}")
        import traceback
        traceback.print_exc()
        # Map known failure modes to friendlier messages.
        error_message = f" ({e}) 請檢查輸入內容或稍後再試一次"
        if "CUDA" in str(e) and "out of memory" in str(e):
            error_message = "請嘗試合成較短的句子或稍後再試"
        elif "break model index not valid" in str(e) or "load tokenizer failed" in str(e):
            error_message = "錯誤"
        raise gr.Error(error_message)
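
# ---------------------------------------------------------------------------
# Hedged sketch (not wired into the UI): how synthesize_speech() could be
# exercised directly from a script, bypassing Gradio. The file name and both
# texts below are hypothetical placeholders, not assets shipped with the app.
# ---------------------------------------------------------------------------
def _demo_direct_synthesis():
    out_wav = synthesize_speech(
        speaker_audio_path="my_sample.wav",    # hypothetical local recording
        content_text="今天天氣真好",             # sentence to synthesize
        speaker_text="錄音中實際說出的每一句話",   # exact transcript of the sample
        speaker_key="",                        # empty -> use the uploaded file
    )
    print(f"synthesized audio saved to {out_wav}")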
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"""
# 思語驛站 (Remembrance Station)
透過先進的聲音技術,盡可能重現熟悉的**{CHILD_NAME}**聲音

**用法:**
1. **選擇時光膠囊 (可選):** 從下拉選單中選擇一個預設的聲音片段 (來自**{CHILD_NAME}**不同時期的聲音)
2. **或自己上傳語音:** 上傳一段**{CHILD_NAME}**的清晰錄音 (建議 5-15 秒,雜音越少越好,若太短生成時會有雜音)。欲上傳或使用錄製音訊時,要先在聲音片段的下拉選單選空白選項
3. **記錄當時的話語 (必填):** 在下方文字框中,**準確輸入**該段錄音中的**每一句話,包含標點符號**
4. **想聽{CHILD_NAME}說的話:** 輸入希望透過**{CHILD_NAME}**的聲音說出的句子
5. **點擊「開始」:** 耐心等待聲音合成。耗時視句子長度和該時段運算資源而定,約需 60~2000 秒 (例:16 字,晚上 11 點,花 400 秒)
""")

    with gr.Row():
        with gr.Column(scale=1):
            speaker_selector = gr.Dropdown(
                label="選擇時光膠囊 (聲音片段)",
                choices=[""] + list(preset_speakers.keys()),
                value="",
                interactive=True
            )
            audio_input = gr.Audio(
                label=f"上傳/錄製{CHILD_NAME}的聲音片段",
                sources=["upload", "microphone"],
                type="filepath",
                interactive=True
            )
            transcription_input = gr.Textbox(
                label="聲音片段的文字記錄 (必填)",
                placeholder=f"一字不差地輸入上方聲音片段裡{CHILD_NAME}說的每一句話,包含標點符號",
                lines=3,
                interactive=True
            )
        with gr.Column(scale=2):
            content_input = gr.Textbox(
                label=f"想聽{CHILD_NAME}說的話",
                placeholder="例如:天氣真好",
                lines=5,
                interactive=True
            )
            output_audio = gr.Audio(
                label="結果",
                type="filepath",
                interactive=False
            )
            btn = gr.Button("開始")

    # Selecting a preset auto-fills the audio clip and its transcription.
    speaker_selector.change(
        fn=apply_preset,
        inputs=speaker_selector,
        outputs=[audio_input, transcription_input]
    )
    btn.click(
        fn=synthesize_speech,
        inputs=[audio_input, content_input, transcription_input, speaker_selector],
        outputs=output_audio
    )

    gr.Markdown(f"""
---
* 為了讓**{CHILD_NAME}**的聲音更自然,提供的「聲音片段」和「文字記錄」需要非常匹配。聲音片段建議 5~15 秒、背景噪音低的清晰人聲,標點符號也很重要
* 欲上傳或使用錄製音訊時,要先在聲音片段的下拉選單選空白選項
* 多人使用時要排隊。如果等待時間過長,或效果不如預期,可以嘗試使用較短的句子,或換一個聲音片段試試看
* 技術仍在發展中,耗時視句子長度和該時段運算資源而定,約需 200~2000 秒;此外,雖然能模仿聲音,但無法複製獨一無二的人
* 如果遇到困難,或想嘗試運算更快的版本:https://colab.research.google.com/drive/11LmxpL4IsQ8rwjki7Qc_84qqYkGskTZI?usp=sharing
""")

if __name__ == "__main__":
    if cosyvoice is None:
        print("錯誤:合成引擎載入失敗")
        # Minimal fallback UI when the model failed to load at startup.
        with gr.Blocks() as demo_error:
            gr.Markdown("啟動失敗,重開試試")
        demo_error.launch()
    else:
        demo.launch()
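
# ---------------------------------------------------------------------------
# Hedged note: the usage text above mentions that concurrent users must
# queue. A minimal sketch of how that could be made explicit with Gradio's
# request queue (the max_size value is a hypothetical cap, not from the
# original app):
#
#     demo.queue(max_size=8)
#     demo.launch()
# ---------------------------------------------------------------------------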