from huggingface_hub import hf_hub_download
import gradio as gr
import torch

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Run on GPU when available; fall back to CPU otherwise.
device = "cuda:0" if torch.cuda.is_available() else "cpu"


def load_model():
    """Download the fine-tuned XTTS-v2 checkpoint from the Hugging Face Hub and load it."""
    model_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="model.pth",
    )
    config_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="config.json",
    )
    vocab_path = hf_hub_download(
        repo_id="tartuNLP/XTTS-v2-est",
        filename="vocab.json",
    )

    config = XttsConfig()
    config.load_json(config_path)
    xtts_model = Xtts.init_from_config(config)
    xtts_model.load_checkpoint(
        config,
        checkpoint_path=model_path,
        vocab_path=vocab_path,
        use_deepspeed=False,
    )
    xtts_model.to(device)
    return xtts_model


model = load_model()


def predict(sentence, language, reference_clip):
    # Reject a missing or unsupported reference clip early
    # (case-insensitive extension check on the uploaded file path).
    if not reference_clip or reference_clip.split(".")[-1].lower() not in ("mp3", "wav"):
        return None

    # Extract the speaker conditioning latents from the reference clip.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_clip,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    # Stream the synthesis and collect the audio chunks as they arrive.
    wav_chunks = []
    for chunk in model.inference_stream(
        text=sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=1.0,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    ):
        if chunk is not None:
            wav_chunks.append(chunk)

    # XTTS outputs 22.05 kHz audio; move the tensor to the CPU before
    # converting to numpy so Gradio can render it.
    return (22050, torch.cat(wav_chunks, dim=0).cpu().numpy())


demo = gr.Interface(
    title="XTTSv2-est Demo",
    description="For best results, provide a reference clip roughly the same length as the sentence you want synthesized.",
    fn=predict,
    inputs=[
        "text",
        gr.Dropdown(
            ["et", "en", "es", "fr", "de", "it", "pt", "pl", "tr",
             "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"]
        ),
        gr.File(),
    ],
    outputs=[gr.Audio()],
)

if __name__ == "__main__":
    demo.launch()
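
# A minimal sketch of calling `predict` directly, without the Gradio UI.
# It assumes a local reference recording at "reference.wav" (a hypothetical
# path) and the optional `soundfile` package for writing the result to disk;
# `predict` returns None when the reference clip is missing or unsupported.
#
#     import soundfile as sf
#
#     result = predict("Tere tulemast!", "et", "reference.wav")
#     if result is not None:
#         sample_rate, audio = result
#         sf.write("output.wav", audio, sample_rate)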