# XTTSv2-est / app_local.py
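"""Gradio demo for XTTSv2-est: synthesize speech from text, cloning the voice
of a user-supplied reference clip via streaming XTTS inference."""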
import gradio as gr
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
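# Prefer the first CUDA GPU when available; otherwise fall back to CPU.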
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load the XTTS model from the local ``model/`` directory."""
    config = XttsConfig()
    config.load_json("model/config.json")
    xtts_model = Xtts.init_from_config(config)
    xtts_model.load_checkpoint(
        config,
        checkpoint_path="model/model.pth",
        vocab_path="model/vocab.json",
        eval=True,  # switch the model to inference mode
        use_deepspeed=False,
    )
    xtts_model.to(device)
    return xtts_model
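# Load the model once at startup so every request reuses the same weights.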
model = load_model()
def predict(sentence, language, reference_clip):
    """Synthesize `sentence` in `language`, cloning the voice in `reference_clip`."""
    # Gradio passes the uploaded file as a path; accept only .mp3/.wav uploads.
    if not reference_clip or reference_clip.split(".")[-1].lower() not in ("mp3", "wav"):
        return None
    # Compute the speaker conditioning latents from the reference audio.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_clip,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )
    # Generate audio chunk by chunk, collecting the stream as it arrives.
    wav_chunks = []
    for chunk in model.inference_stream(
        text=sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=model.config.temperature,
        length_penalty=model.config.length_penalty,
        repetition_penalty=model.config.repetition_penalty,
        top_k=model.config.top_k,
        top_p=model.config.top_p,
    ):
        if chunk is not None:
            wav_chunks.append(chunk)
    if not wav_chunks:
        return None
    # Move the waveform off the GPU before handing a (sample_rate, ndarray) pair to Gradio.
    return (22050, torch.cat(wav_chunks, dim=0).cpu().numpy())
demo = gr.Interface(
    title="XTTSv2-est Demo",
    description="For best results, provide a reference clip roughly as long as the sentence you want synthesized.",
    fn=predict,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Dropdown(["et", "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"], label="Language", value="et"),
        gr.File(label="Reference clip (.mp3 or .wav)"),
    ],
    outputs=[gr.Audio(label="Generated speech")],
)
if __name__ == "__main__":
    demo.queue()  # enable request queuing so concurrent users are served in turn
    demo.launch()
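# Run locally with `python app_local.py`; expects the model/ directory
# (config.json, model.pth, vocab.json) to be present in the working directory.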