import logging
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
import romajitable
import re
import numpy as np
import IPython.display as ipd
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
import datetime
import os
import librosa
from mel_processing import spectrogram_torch
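
# Gradio front-end for a multi-speaker VITS model: tabs for TTS synthesis,
# model selection, voice conversion, and long-text synthesis with SRT subtitles.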
class VitsGradio:
    def __init__(self):
        self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.lan = ["中文","日文","自动","手动"]
        self.idols = ["chinese1","chinese2","chinese3","高咲侑","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり","純那","香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽","あるる"]
        self.modelPaths = []
        for root, dirs, files in os.walk("checkpoints"):
            for dir in dirs:
                self.modelPaths.append(dir)
        with gr.Blocks() as self.Vits:
            gr.Markdown(
                "## <center> Lovelive虹团中日双语VITS\n"
                "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
                "<div align='center'>目前有虹团标贝普通话版(biaobei),虹团模型(default),少歌模型(ShojoKageki)以及混合模型(tmp)</div>"
                '<div align="center"><a>参数说明:默认参数适合汉语普通话,合成日语时建议将噪声比例调节至0.667,噪声偏差对应着每个字之间的间隔,对普通话影响较大,duration代表整体语速</div>'
                '<div align="center"><a>合成前请先选择模型,建议选择tmp模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
| with gr.Tab("TTS合成"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| with gr.Row(): | |
| with gr.Column(): | |
| input1 = gr.TextArea(label="Text", value="为什么你会那么熟练啊?你和雪菜亲过多少次了") | |
| input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True) | |
| input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True) | |
| btnVC = gr.Button("Submit") | |
| with gr.Column(): | |
| input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267) | |
| input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7) | |
| input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1) | |
| output1 = gr.Audio(label="采样率22050") | |
| btnVC.click(self.infer, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1]) | |
| with gr.Tab("选择模型"): | |
| with gr.Column(): | |
| modelstrs = gr.Dropdown(label = "模型", choices = self.modelPaths, value = self.modelPaths[0], type = "value") | |
| btnMod = gr.Button("载入模型") | |
| statusa = gr.TextArea() | |
| btnMod.click(self.loadCk, inputs=[modelstrs], outputs = [statusa]) | |
| with gr.Tab("Voice Conversion"): | |
| gr.Markdown(""" | |
| 录制或上传声音,并选择要转换的音色。 | |
| """) | |
| with gr.Column(): | |
| record_audio = gr.Audio(label="record your voice", source="microphone") | |
| upload_audio = gr.Audio(label="or upload audio here", source="upload") | |
| source_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="source speaker") | |
| target_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="target speaker") | |
| with gr.Column(): | |
| message_box = gr.Textbox(label="Message") | |
| converted_audio = gr.Audio(label='converted audio') | |
| btn = gr.Button("Convert!") | |
| btn.click(self.vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio], | |
| outputs=[message_box, converted_audio]) | |
| with gr.Tab("小说合成(带字幕)"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| with gr.Row(): | |
| with gr.Column(): | |
| input1 = gr.TextArea(label="建议colab或本地克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了") | |
| input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True) | |
| input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True) | |
| btnVC = gr.Button("Submit") | |
| with gr.Column(): | |
| input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267) | |
| input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7) | |
| input6 = gr.Slider(minimum=0.1, maximum=10, label="Duration", value=1) | |
| output1 = gr.Audio(label="采样率22050") | |
| subtitle = gr.outputs.File(label="字幕文件:subtitles.srt") | |
| btnVC.click(self.infer2, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1,subtitle]) | |
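
    # Load hyperparameters and weights for the model directory chosen in the "选择模型" tab.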
    def loadCk(self, path):
        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
        self.net_g = SynthesizerTrn(
            len(symbols),
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            n_speakers=self.hps.data.n_speakers,
            **self.hps.model).to(self.dev)
        _ = self.net_g.eval()
        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.net_g)
        return "success"
    def get_text(self, text):
        text_norm = text_to_sequence(text, self.hps.data.text_cleaners)
        if self.hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm
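
    # Script-detection helpers used to pick language tags automatically.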
    def is_japanese(self, string):
        for ch in string:
            if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
                return True
        return False

    def is_english(self, string):
        pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
        if pattern.fullmatch(string):
            return True
        else:
            return False
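
    # Map a display name from the speaker dropdown to its speaker id in the model's embedding table.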
    def selection(self, speaker):
        # Speaker ids are not contiguous: 13-15, 17 and 20 are not mapped to any name here.
        speaker_ids = {
            "高咲侑": 0, "歩夢": 1, "かすみ": 2, "しずく": 3, "果林": 4,
            "愛": 5, "彼方": 6, "せつ菜": 7, "エマ": 8, "璃奈": 9,
            "栞子": 10, "ランジュ": 11, "ミア": 12,
            "chinese1": 16, "chinese2": 18, "chinese3": 19,
            "華恋": 21, "まひる": 22, "なな": 23, "クロディーヌ": 24, "ひかり": 25,
            "純那": 26, "香子": 27, "真矢": 28, "双葉": 29, "ミチル": 30,
            "メイファン": 31, "やちよ": 32, "晶": 33, "いちえ": 34, "ゆゆ子": 35,
            "塁": 36, "珠緒": 37, "あるる": 38, "ララフィン": 39, "美空": 40, "静羽": 41,
        }
        return speaker_ids.get(speaker, 0)
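
    # Wrap the text in the language tags consumed by the text cleaners ([ZH]/[JA]/[EN]),
    # or pass it through unchanged in manual mode.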
    def sle(self, language, text):
        text = text.replace('\n', '。').replace(' ', ',')
        if language == "中文":
            tts_input1 = "[ZH]" + text + "[ZH]"
            return tts_input1
        elif language == "自动":
            tts_input1 = f"[JA]{text}[JA]" if self.is_japanese(text) else f"[ZH]{text}[ZH]"
            return tts_input1
        elif language == "日文":
            tts_input1 = "[JA]" + text + "[JA]"
            return tts_input1
        elif language == "英文":
            tts_input1 = "[EN]" + text + "[EN]"
            return tts_input1
        elif language == "手动":
            return text
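
    # Split the input into sentences for long-text synthesis and subtitle generation;
    # English-only lines are first transliterated to katakana via romajitable.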
    def extrac(self, text):
        text = re.sub("<[^>]*>", "", text)
        result_list = re.split(r'\n', text)
        final_list = []
        for i in result_list:
            if self.is_english(i):
                i = romajitable.to_kana(i).katakana
            i = i.replace('\n', '').replace(' ', '')
            # Splitting sentences longer than 20 characters at 。/! is currently disabled:
            '''
            if len(i)>1:
                if len(i) > 20:
                    try:
                        cur_list = re.split(r'。|!', i)
                        for i in cur_list:
                            if len(i)>1:
                                final_list.append(i+'。')
                    except:
                        pass
                else:
                    final_list.append(i)
            '''
            final_list.append(i)
        final_list = [x for x in final_list if x != '']
        print(final_list)
        return final_list
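
    # Any-to-any voice conversion: compute a spectrogram from the recorded/uploaded audio
    # and resynthesize it with the target speaker's embedding.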
    def vc_fn(self, original_speaker, target_speaker, record_audio, upload_audio):
        input_audio = record_audio if record_audio is not None else upload_audio
        if input_audio is None:
            return "You need to record or upload an audio", None
        sampling_rate, audio = input_audio
        original_speaker_id = self.selection(original_speaker)
        target_speaker_id = self.selection(target_speaker)
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != self.hps.data.sampling_rate:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=self.hps.data.sampling_rate)
        with torch.no_grad():
            y = torch.FloatTensor(audio)
            y = y / max(-y.min(), y.max()) / 0.99
            y = y.to(self.dev)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, self.hps.data.filter_length,
                                     self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
                                     center=False).to(self.dev)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.dev)
            sid_src = torch.LongTensor([original_speaker_id]).to(self.dev)
            sid_tgt = torch.LongTensor([target_speaker_id]).to(self.dev)
            audio = self.net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                0, 0].data.cpu().float().numpy()
        del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (self.hps.data.sampling_rate, audio)
    def infer(self, text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
        try:
            speaker_id = int(self.selection(speaker_id))
            t1 = time.time()
            stn_tst = self.get_text(self.sle(language, text))
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(self.dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
                sid = torch.LongTensor([speaker_id]).to(self.dev)
                audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
            t2 = time.time()
            spending_time = "推理时间为:" + str(t2 - t1) + "s"
            print(spending_time)
            return (self.hps.data.sampling_rate, audio)
        except:
            # If no model has been loaded yet (or inference fails), fall back to loading the
            # default biaobei checkpoint; this call produces no audio, so submit again afterwards.
            self.hps = utils.get_hparams_from_file("checkpoints/biaobei/config.json")
            self.net_g = SynthesizerTrn(
                len(symbols),
                self.hps.data.filter_length // 2 + 1,
                self.hps.train.segment_size // self.hps.data.hop_length,
                n_speakers=self.hps.data.n_speakers,
                **self.hps.model).to(self.dev)
            _ = self.net_g.eval()
            _ = utils.load_checkpoint("checkpoints/biaobei/model.pth", self.net_g)
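
    # Long-text TTS used by the "小说合成(带字幕)" tab: synthesizes each sentence,
    # concatenates the audio, and writes cue timings to subtitles.srt.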
    def infer2(self, text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
        speaker_id = int(self.selection(speaker_id))
        a = ['【', '[', '(', '(']
        b = ['】', ']', ')', ')']
        for i in a:
            text = text.replace(i, '<')
        for i in b:
            text = text.replace(i, '>')
        final_list = self.extrac(text.replace('“', '').replace('”', ''))
        audio_fin = []
        c = 0
        t = datetime.timedelta(seconds=0)
        f1 = open("subtitles.srt", 'w', encoding='utf-8')
        for sentence in final_list:
            c += 1
            stn_tst = self.get_text(self.sle(language, sentence))
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(self.dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
                sid = torch.LongTensor([speaker_id]).to(self.dev)
                t1 = time.time()
                audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
                t2 = time.time()
                spending_time = "第" + str(c) + "句的推理时间为:" + str(t2 - t1) + "s"
                print(spending_time)
                # SRT timestamps: HH:MM:SS,mmm (milliseconds must be zero-padded to three digits).
                time_start = str(t).split(".")[0] + "," + f"{t.microseconds // 1000:03d}"
                last_time = datetime.timedelta(seconds=len(audio) / float(22050))
                t += last_time
                time_end = str(t).split(".")[0] + "," + f"{t.microseconds // 1000:03d}"
                print(time_end)
                f1.write(str(c - 1) + '\n' + time_start + ' --> ' + time_end + '\n' + sentence + '\n\n')
                audio_fin.append(audio)
        f1.close()
        file_path = "subtitles.srt"
        return (self.hps.data.sampling_rate, np.concatenate(audio_fin)), file_path
| print("开始部署") | |
| grVits = VitsGradio() | |
| grVits.Vits.launch() |