import gradio as gr
import torch

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

device = "cuda:0" if torch.cuda.is_available() else "cpu"


def load_model():
    config = XttsConfig()
    config.load_json("model/config.json")
    XTTS_MODEL = Xtts.init_from_config(config)
    XTTS_MODEL.load_checkpoint(
        config,
        checkpoint_path="model/model.pth",
        vocab_path="model/vocab.json",
        eval=True,
        use_deepspeed=False,
    )
    XTTS_MODEL.to(device)
    return XTTS_MODEL


model = load_model()


def predict(sentence, language, reference_clip):
    # Accept only .mp3/.wav reference clips; return None so the Audio output
    # stays empty instead of crashing on an unsupported file.
    if not reference_clip or reference_clip.rsplit(".", 1)[-1].lower() not in ("mp3", "wav"):
        return None

    # Compute the speaker conditioning latents from the reference clip.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_clip,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    # Generate audio chunk by chunk, then concatenate into a single waveform.
    wav_chunks = []
    for chunk in model.inference_stream(
        text=sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=model.config.temperature,
        length_penalty=model.config.length_penalty,
        repetition_penalty=model.config.repetition_penalty,
        top_k=model.config.top_k,
        top_p=model.config.top_p,
    ):
        if chunk is not None:
            wav_chunks.append(chunk)

    # Move the waveform to the CPU before converting to numpy (required when
    # running on CUDA), and read the output sample rate from the model config
    # rather than hardcoding it.
    wav = torch.cat(wav_chunks, dim=0).cpu().numpy()
    return (model.config.audio.output_sample_rate, wav)


demo = gr.Interface(
    title="XTTSv2-est Demo",
    description=(
        "To get the best results, provide a reference clip around the same "
        "length as the output sentence you want."
    ),
    fn=predict,
    inputs=[
        gr.Textbox(label="Sentence"),
        gr.Dropdown(
            ["et", "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
             "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"],
            label="Language",
        ),
        gr.File(label="Reference clip (.mp3 or .wav)"),
    ],
    outputs=[gr.Audio(label="Generated speech")],
)

if __name__ == "__main__":
    demo.queue()
    demo.launch()
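
# Usage note (the script filename "app.py" is a hypothetical example, not given
# above): place the fine-tuned XTTS v2 checkpoint files under ./model/ as
# config.json, model.pth, and vocab.json, then start the demo with
#   python app.py
# and open the local URL that Gradio prints.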