from huggingface_hub import hf_hub_download
import gradio as gr
import torch

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Use the first GPU when available; CPU inference works but is much slower.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

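# Fetch the fine-tuned Estonian XTTS-v2 checkpoint, config, and vocabulary
# from the Hugging Face Hub (cached locally after the first download) and
# build the model once at startup.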
def load_model():
    model_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="model.pth",
    )
    config_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="config.json",
    )
    vocab_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="vocab.json",
    )

    config = XttsConfig()
    config.load_json(config_path)
    XTTS_MODEL = Xtts.init_from_config(config)
    XTTS_MODEL.load_checkpoint(
        config,
        checkpoint_path=model_path,
        vocab_path=vocab_path,
        use_deepspeed=False,
    )
    XTTS_MODEL.to(device)
    return XTTS_MODEL

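# Instantiate the model once at import time so every request reuses it.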
model = load_model()

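# predict: clone the voice from the uploaded reference clip and synthesise
# the given sentence in the selected language. Returns a (sample_rate,
# waveform) tuple, which is the format gr.Audio accepts for numpy output.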
def predict(sentence, language, reference_clip):
    # Reject missing uploads and unsupported formats instead of crashing.
    if not reference_clip or reference_clip.split(".")[-1].lower() not in ("mp3", "wav"):
        return None
    # Condition the model on the reference speaker.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_clip,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

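    # Generate audio with the streaming API, collecting the chunks into one
    # finished clip for gr.Audio rather than streaming them to the client.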
    wav_chunks = []
    for chunk in model.inference_stream(
        text=sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=1.0,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    ):
        if chunk is not None:
            wav_chunks.append(chunk)

    # Move the result to the CPU before converting to numpy (the chunks live
    # on `device`), and report the model's own output sample rate to gr.Audio.
    wav = torch.cat(wav_chunks, dim=0).cpu().numpy()
    return (model.config.audio.output_sample_rate, wav)

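# Hypothetical programmatic use (the path and sentence are illustrative only):
#   sr, audio = predict("Tere, kuidas läheb?", "et", "speaker.wav")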
demo = gr.Interface(
    title="XTTSv2-est Demo",
    description="For best results, provide a reference clip roughly as long as the output sentence you want.",
    fn=predict,
    inputs=[
        "text",
        gr.Dropdown(["et", "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"], label="Language"),
        gr.File(label="Reference clip (.wav or .mp3)"),
    ],
    outputs=[gr.Audio()],
)

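# demo.launch() starts a local web server; pass share=True for a public link.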
if __name__ == "__main__":
    demo.launch()