import subprocess
import random
import os
from pathlib import Path

import librosa
from scipy.io import wavfile
import numpy as np
import torch
import csv
import whisper
import gradio as gr

# Install the KAN-TTS / auto-labeling dependencies at startup (the Space runs pip via os.system).
os.system("pip install --upgrade Cython==0.29.35")
os.system("pip install pysptk --no-build-isolation")
os.system("pip install kantts -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")
os.system("pip install tts-autolabel -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")

import sox

def split_long_audio(model, filepaths, save_dir="data_dir", out_sr=44100):
    """Transcribe each input file with Whisper and write one resampled WAV per segment."""
    if isinstance(filepaths, str):
        filepaths = [filepaths]

    for file_idx, filepath in enumerate(filepaths):
        save_path = Path(save_dir)
        save_path.mkdir(exist_ok=True, parents=True)

        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']

        # Load, trim silence, normalize, and resample the full recording.
        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
        wav, _ = librosa.effects.trim(wav, top_db=20)
        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        wav2 /= max(wav2.max(), -wav2.min())

        # Cut the audio at Whisper's segment boundaries and save each piece as 16-bit PCM.
        for i, seg in enumerate(segments):
            start_time = seg['start']
            end_time = seg['end']
            wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]
            wav_seg_name = f"{file_idx}_{i}.wav"
            out_fpath = save_path / wav_seg_name
            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))
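
# Illustrative only (not part of the app): a minimal standalone use of split_long_audio().
# The file name "sample.wav" is an assumption; in this app the function is called from
# auto_label() below.
#   model = whisper.load_model("medium")
#   split_long_audio(model, "sample.wav", save_dir="data_dir")
#   # -> writes data_dir/0_0.wav, data_dir/0_1.wav, ... as 16-bit PCM at 44.1 kHz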

whisper_size = "medium"
whisper_model = whisper.load_model(whisper_size)

from modelscope.tools import run_auto_label
from modelscope.models.audio.tts import SambertHifigan
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import TtsTrainType

pretrained_model_id = 'damo/speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k'
dataset_id = "/home/user/app/output_training_data/"
pretrain_work_dir = "/home/user/app/pretrain_work_dir/"
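
# How the three UI steps below fit together (paths are the ones defined above):
#   1. auto_label(): splits the uploaded/recorded audio into /home/user/app/test_wavs/ and
#      runs run_auto_label() to build a labeled training set in output_training_data/.
#   2. train(): finetunes pretrained_model_id on that dataset and writes its outputs into
#      pretrain_work_dir/.
#   3. infer(): loads the finetuned checkpoints from pretrain_work_dir/ and synthesizes speech.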

def auto_label(Voicetoclone, VoiceMicrophone):
    """Segment the uploaded or recorded audio and auto-label it into a training set."""
    # Prefer the microphone recording when both inputs are provided.
    audio = VoiceMicrophone if VoiceMicrophone is not None else Voicetoclone
    try:
        split_long_audio(whisper_model, audio, "/home/user/app/test_wavs/")
        input_wav = "/home/user/app/test_wavs/"
        output_data = "/home/user/app/output_training_data/"
        ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.7")
    except Exception as e:
        # Surface the failure instead of silently reporting success.
        return f"标注失败:{e}"
    return "标注成功"

def train(a):
    """Finetune the Sambert acoustic model on the auto-labeled dataset.

    `a` receives the labeling-status text from the UI and is not used.
    """
    try:
        train_info = {
            TtsTrainType.TRAIN_TYPE_SAMBERT: {   # settings for finetuning the AM (Sambert) model
                'train_steps': 52,               # total number of training steps
                'save_interval_steps': 50,       # save a checkpoint every N steps
                'log_interval': 10               # print a training log line every N steps
            }
        }
        # Training configuration: model to finetune, dataset, temporary work directory and train_info.
        kwargs = dict(
            model=pretrained_model_id,     # the model to finetune
            model_revision="v1.0.6",
            work_dir=pretrain_work_dir,    # temporary work directory
            train_dataset=dataset_id,      # dataset path
            train_type=train_info          # training type and parameters
        )
        trainer = build_trainer(Trainers.speech_kantts_trainer, default_args=kwargs)
        trainer.train()
    except Exception as e:
        # Surface the failure instead of silently reporting success.
        return f"训练失败:{e}"
    return "训练完成"

def infer(text):
    """Synthesize speech for `text` with the finetuned Sambert-HiFiGAN checkpoints."""
    model_dir = "/home/user/app/pretrain_work_dir/"
    # Point the pipeline at the finetuned acoustic model plus the base vocoder and speaker embedding.
    custom_infer_abs = {
        'voice_name': 'F7',
        'am_ckpt': os.path.join(model_dir, 'tmp_am', 'ckpt'),
        'am_config': os.path.join(model_dir, 'tmp_am', 'config.yaml'),
        'voc_ckpt': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
        'voc_config': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'),
        'audio_config': os.path.join(model_dir, 'data', 'audio_config.yaml'),
        'se_file': os.path.join(model_dir, 'data', 'se', 'se.npy')
    }
    kwargs = {'custom_ckpt': custom_infer_abs}
    model = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)

    inference = pipeline(task=Tasks.text_to_speech, model=model)
    output = inference(input=text)

    # Write the synthesized waveform to a uniquely named file and return its path.
    filename = str(random.randint(1, 1000000000000)) + "myfile.wav"
    with open(filename, mode='bx') as f:
        f.write(output["output_wav"])
    return filename
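
# Illustrative only (not part of the app): infer() can also be called directly once training
# has produced checkpoints under pretrain_work_dir/. The sample sentence is an assumption.
#   wav_path = infer("欢迎使用声音克隆。")
#   print(wav_path)  # path of the generated WAV file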

from textwrap import dedent

app = gr.Blocks()

with app:
    gr.Markdown("# <center>🥳🎶🎡 - Sambert中文声音克隆</center>")
    gr.Markdown("## <center>🌟 - 训练3分钟,推理5秒钟,中英真实拟声 </center>")
    gr.Markdown("### <center>🌊 - 更多精彩应用,敬请关注[滔滔AI](http://www.talktalkai.com);滔滔AI,为爱滔滔!💕</center>")

    with gr.Row():
        with gr.Column():
            inp1 = gr.Audio(type="filepath", source="upload", label="方案一:请从本地上传一段语音")
            inp_micro = gr.Audio(type="filepath", source="microphone", label="方案二:请用麦克风录制您的声音")
        with gr.Column():
            out1 = gr.Textbox(label="标注情况", lines=1, interactive=False)
            out2 = gr.Textbox(label="训练情况", lines=1, interactive=False)
            inp2 = gr.Textbox(label="请在这里填写您想合成的文本", placeholder="想说却还没说的 还很多...", lines=3)
        with gr.Column():
            out3 = gr.Audio(type="filepath", label="为您合成的专属音频")

    with gr.Row():
        btn1 = gr.Button("1.标注数据")
        btn2 = gr.Button("2.开始训练")
        btn3 = gr.Button("3.一键推理", variant="primary")

    btn1.click(auto_label, [inp1, inp_micro], out1)
    btn2.click(train, out1, out2)
    btn3.click(infer, inp2, out3)

    with gr.Accordion("📒 使用指南", open=True):
        _ = f""" 如何使用此程序:
        * 使用方案一或方案二,上传一分钟左右的语音后,依次点击“1.标注数据”、“2.开始训练”、“3.一键推理”即可开启声音克隆之旅
        * 选择两个方案中的一个即可,程序会优先使用麦克风上传的语音;如果您需要从本地上传语音文件,请不要同时用方案二录制语音
        * 您可以随时编辑想要合成的文本内容,但请不要生成会对个人以及组织造成侵害的内容
        * 如果您需要用方案二录制您的声音,以下是一段长度合适的文本,供您朗读并录制:
        我看到鸟儿飞到天空,它们飞得多快呀。明天它们再飞过同样的路线,也永远不是今天了。或许明天飞过这条路线的,不是老鸟,而是小鸟了。时间过得飞快,使我小心眼里不只是着急,还有悲伤。有一天我放学回家,看到太阳快落山了,就下决心说:“我要比太阳更快地回家。”我狂奔回去,站在庭院里喘气的时候,看到太阳还露着半边脸,我高兴地跳起来。那一天我跑赢了太阳。以后我常做这样的游戏,有时和太阳赛跑,有时和西北风比赛,有时一个暑假的作业,我十天就做完了。那时我三年级,常把哥哥五年级的作业拿来做。后来的二十年里,我因此受益无穷。虽然我知道人永远跑不过时间,但是可以比原来快跑几步。那几步虽然很小很小,但作用却很大很大。如果将来我有什么要教给我的孩子,我会告诉他:假若你一直和时间赛跑,你就可以成功。
        """
        gr.Markdown(dedent(_))

    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
    gr.HTML('''
    <div class="footer">
        <p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘
        </p>
    </div>
    ''')

app.launch(show_error=True)