from huggingface_hub import hf_hub_download
import gradio as gr
import torch

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Run on GPU when available; fall back to CPU otherwise.
device = "cuda:0" if torch.cuda.is_available() else "cpu"


def load_model():
    """Download the fine-tuned XTTS-v2 checkpoint from the Hugging Face Hub and load it."""
    model_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="model.pth",
    )
    config_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="config.json",
    )
    vocab_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="vocab.json",
    )

    config = XttsConfig()
    config.load_json(config_path)
    xtts_model = Xtts.init_from_config(config)
    xtts_model.load_checkpoint(
        config,
        checkpoint_path=model_path,
        vocab_path=vocab_path,
        use_deepspeed=False,
    )
    xtts_model.to(device)
    return xtts_model


model = load_model()


def predict(sentence, language, reference_clip):
    # Reject a missing or unsupported reference clip early
    # (case-insensitive extension check on the uploaded file path).
    if not reference_clip or reference_clip.split(".")[-1].lower() not in ("mp3", "wav"):
        return None

    # Extract the speaker conditioning latents from the reference clip.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_clip,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    # Stream the synthesis and collect the audio chunks as they arrive.
    wav_chunks = []
    for chunk in model.inference_stream(
        text=sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=1.0,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    ):
        if chunk is not None:
            wav_chunks.append(chunk)

    # XTTS outputs 22.05 kHz audio; move the tensor to the CPU before
    # converting to numpy so Gradio can render it.
    return (22050, torch.cat(wav_chunks, dim=0).cpu().numpy())


demo = gr.Interface(
    title="XTTSv2-est Demo",
    description="For best results, provide a reference clip roughly the same length as the sentence you want synthesized.",
    fn=predict,
    inputs=[
        "text",
        gr.Dropdown(
            ["et", "en", "es", "fr", "de", "it", "pt", "pl", "tr",
             "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"]
        ),
        gr.File(),
    ],
    outputs=[gr.Audio()],
)

if __name__ == "__main__":
    demo.launch()
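
# A minimal sketch of calling `predict` directly, without the Gradio UI.
# It assumes a local reference recording at "reference.wav" (a hypothetical
# path) and the optional `soundfile` package for writing the result to disk;
# `predict` returns None when the reference clip is missing or unsupported.
#
#     import soundfile as sf
#
#     result = predict("Tere tulemast!", "et", "reference.wav")
#     if result is not None:
#         sample_rate, audio = result
#         sf.write("output.wav", audio, sample_rate)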