"""Gradio demo for StyleTTS2-lite-vi: voice-cloning TTS with style controls.

Clones the Hugging Face model repo on first run, loads the checkpoint, and
serves a Blocks UI that synthesizes speech from text + reference audio.
"""

import os
import subprocess
import sys

import gradio as gr
import numpy as np
import soundfile as sf
import torch

REPO_URL = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
REPO_DIR = "StyleTTS2-lite-vi"

# Clone the model repo on first run. check=True so a failed clone raises
# here instead of surfacing later as a confusing ImportError.
if not os.path.exists(REPO_DIR):
    subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)

# Make the cloned repo importable, then load the model.
sys.path.append(os.path.abspath(REPO_DIR))
from inference import StyleTTS2  # noqa: E402  (depends on sys.path above)

device = "cuda" if torch.cuda.is_available() else "cpu"
config_path = os.path.join(REPO_DIR, "Models", "config.yml")
models_path = os.path.join(REPO_DIR, "Models", "model_vi_en.pth")
model = StyleTTS2(config_path, models_path).to(device)


def _build_speakers(reference_audio_paths):
    """Map uploaded reference audio files to speaker ids id_1, id_2, ...

    Language is fixed to Vietnamese and speed to 1.1, matching the
    original demo's defaults.
    """
    return {
        f"id_{i}": {"path": path, "lang": "vi", "speed": 1.1}
        for i, path in enumerate(reference_audio_paths, 1)
    }


def process_inputs(text_prompt, reference_audio_paths, n_merge, randomness,
                   smooth_dur, denoise, t_denoise, split_dur):
    """Synthesize speech for `text_prompt` in the style of the reference audio.

    Returns the path of the written WAV file ("output.wav"), which the
    gr.Audio output component plays back.
    """
    # Fail fast with a user-visible message: with no references the speaker
    # dict is empty and the hard-coded "[id_1]" tag below cannot resolve.
    if not reference_audio_paths:
        raise gr.Error("Please upload at least one reference audio file.")

    speakers = _build_speakers(reference_audio_paths)

    # Argument order follows the repo's StyleTTS2.generate signature —
    # TODO(review): confirm against inference.py if the upstream repo changes.
    # "[id_1]" selects the first uploaded reference as the default voice.
    r = model.generate(
        text_prompt, speakers, denoise, t_denoise, split_dur,
        "[id_1]", n_merge, randomness, smooth_dur,
    )

    # Peak-normalize to [-1, 1]; guard against an all-zero (silent) result,
    # which would otherwise produce NaN/inf samples via division by zero.
    peak = np.abs(r).max()
    if peak > 0:
        r = r / peak

    # NOTE(review): a fixed filename is not safe under concurrent requests;
    # kept as-is to preserve the original interface.
    sf.write("output.wav", r, samplerate=24000)
    return "output.wav"


custom_css = """
#custom-box {
    min-height: 300px !important;
    display: flex;
    align-items: center;
}
#custom-box textarea {
    min-height: 250px !important;
    height: 100% !important;
}
"""

# Gradio UI
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("## StyleTTS2-lite-vi Demo")
    gr.Markdown("Upload a reference audio and input your text to synthesize speech with style control.")

    with gr.Row():
        text_prompt = gr.Textbox(
            label="Text Prompt",
            placeholder="Enter your text here...",
            elem_id="custom-box",
        )
        reference_audios = gr.File(
            label="Reference Audios",
            file_types=[".wav", ".mp3", ".flac"],
            file_count="multiple",
            elem_id="custom-box",
        )

    # Parameters
    with gr.Accordion("Advanced Settings", open=False):
        denoise = gr.Checkbox(label="Apply Denoising", value=True)
        t_denoise = gr.Slider(0.0, 1.0, value=0.3, label="Denoise Strength")
        n_merge = gr.Slider(1, 30, value=16, label="Min Words to Merge")
        randomness = gr.Slider(0.0, 1.0, value=0.2, label="Randomness")
        smooth_dur = gr.Slider(0.0, 1.0, value=0.15, label="Smooth Duration")
        split_dur = gr.Slider(0, 10, step=1, value=3, label="Split Ref Audio Duration")

    submit_button = gr.Button("Synthesize")
    synthesized_audio = gr.Audio(label="Synthesized Audio", type="filepath")

    submit_button.click(
        fn=process_inputs,
        inputs=[
            text_prompt, reference_audios, n_merge, randomness,
            smooth_dur, denoise, t_denoise, split_dur,
        ],
        outputs=synthesized_audio,
    )

if __name__ == "__main__":
    demo.launch()