import gradio as gr
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Run on the GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load the fine-tuned XTTS v2 checkpoint from the local model/ directory."""
    config = XttsConfig()
    config.load_json("model/config.json")
    xtts_model = Xtts.init_from_config(config)
    xtts_model.load_checkpoint(
        config,
        checkpoint_path="model/model.pth",
        vocab_path="model/vocab.json",
        eval=True,
        use_deepspeed=False,
    )
    xtts_model.to(device)
    return xtts_model
model = load_model()
def predict(sentence, language, reference_clip):
    # Only accept .mp3/.wav reference clips; return None so the Audio output stays empty.
    if not reference_clip or reference_clip.split(".")[-1].lower() not in ("mp3", "wav"):
        return None
    # Extract the speaker conditioning latents and embedding from the reference clip.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_clip,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )
    # Stream the synthesis and collect the audio chunks, using the sampling
    # parameters stored in the model config.
    wav_chunks = []
    for chunk in model.inference_stream(
        text=sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=model.config.temperature,
        length_penalty=model.config.length_penalty,
        repetition_penalty=model.config.repetition_penalty,
        top_k=model.config.top_k,
        top_p=model.config.top_p,
    ):
        if chunk is not None:
            wav_chunks.append(chunk)
    # Concatenate the chunks into one waveform; move it to the CPU before converting
    # to NumPy for the Gradio Audio output, and report the model's output sample rate.
    wav = torch.cat(wav_chunks, dim=0).cpu().numpy()
    return (model.config.audio.output_sample_rate, wav)
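# Illustrative direct call (not part of the Space UI); the file name below is just a
# placeholder for any local reference clip:
#   sample_rate, audio = predict("Tere tulemast!", "et", "reference.wav")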
demo = gr.Interface(
    title="XTTSv2-est Demo",
    description="For the best results, provide a reference clip roughly the same length as the sentence you want synthesized.",
    fn=predict,
    inputs=[
        gr.Textbox(label="Sentence"),
        gr.Dropdown(["et", "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"], label="Language"),
        gr.File(label="Reference clip (.wav or .mp3)"),
    ],
    outputs=[gr.Audio(label="Generated speech")],
)
if __name__ == "__main__":
    demo.queue()
    demo.launch()