"""Gradio demo for StyleTTS2-lite-vi: voice-cloning TTS with style controls.

Clones the Hugging Face model repo on first run, loads the checkpoint, and
serves a Blocks UI that synthesizes speech from text + reference audio.
"""

import os
import subprocess
import sys

import gradio as gr
import numpy as np
import soundfile as sf
import torch

REPO_URL = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
REPO_DIR = "StyleTTS2-lite-vi"

# Clone the model repo on first run. check=True so a failed clone raises
# here instead of surfacing later as a confusing ImportError.
if not os.path.exists(REPO_DIR):
    subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)

# Make the cloned repo importable, then load the model.
sys.path.append(os.path.abspath(REPO_DIR))
from inference import StyleTTS2  # noqa: E402  (depends on sys.path above)

device = "cuda" if torch.cuda.is_available() else "cpu"
config_path = os.path.join(REPO_DIR, "Models", "config.yml")
models_path = os.path.join(REPO_DIR, "Models", "model_vi_en.pth")
model = StyleTTS2(config_path, models_path).to(device)


def _build_speakers(reference_audio_paths):
    """Map uploaded reference audio files to speaker ids id_1, id_2, ...

    Language is fixed to Vietnamese and speed to 1.1, matching the
    original demo's defaults.
    """
    return {
        f"id_{i}": {"path": path, "lang": "vi", "speed": 1.1}
        for i, path in enumerate(reference_audio_paths, 1)
    }


def process_inputs(text_prompt, reference_audio_paths, n_merge, randomness,
                   smooth_dur, denoise, t_denoise, split_dur):
    """Synthesize speech for `text_prompt` in the style of the reference audio.

    Returns the path of the written WAV file ("output.wav"), which the
    gr.Audio output component plays back.
    """
    # Fail fast with a user-visible message: with no references the speaker
    # dict is empty and the hard-coded "[id_1]" tag below cannot resolve.
    if not reference_audio_paths:
        raise gr.Error("Please upload at least one reference audio file.")

    speakers = _build_speakers(reference_audio_paths)

    # Argument order follows the repo's StyleTTS2.generate signature —
    # TODO(review): confirm against inference.py if the upstream repo changes.
    # "[id_1]" selects the first uploaded reference as the default voice.
    r = model.generate(
        text_prompt, speakers, denoise, t_denoise, split_dur,
        "[id_1]", n_merge, randomness, smooth_dur,
    )

    # Peak-normalize to [-1, 1]; guard against an all-zero (silent) result,
    # which would otherwise produce NaN/inf samples via division by zero.
    peak = np.abs(r).max()
    if peak > 0:
        r = r / peak

    # NOTE(review): a fixed filename is not safe under concurrent requests;
    # kept as-is to preserve the original interface.
    sf.write("output.wav", r, samplerate=24000)
    return "output.wav"


custom_css = """
#custom-box {
    min-height: 300px !important;
    display: flex;
    align-items: center;
}
#custom-box textarea {
    min-height: 250px !important;
    height: 100% !important;
}
"""

# Gradio UI
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("## StyleTTS2-lite-vi Demo")
    gr.Markdown("Upload a reference audio and input your text to synthesize speech with style control.")

    with gr.Row():
        text_prompt = gr.Textbox(
            label="Text Prompt",
            placeholder="Enter your text here...",
            elem_id="custom-box",
        )
        reference_audios = gr.File(
            label="Reference Audios",
            file_types=[".wav", ".mp3", ".flac"],
            file_count="multiple",
            elem_id="custom-box",
        )

    # Parameters
    with gr.Accordion("Advanced Settings", open=False):
        denoise = gr.Checkbox(label="Apply Denoising", value=True)
        t_denoise = gr.Slider(0.0, 1.0, value=0.3, label="Denoise Strength")
        n_merge = gr.Slider(1, 30, value=16, label="Min Words to Merge")
        randomness = gr.Slider(0.0, 1.0, value=0.2, label="Randomness")
        smooth_dur = gr.Slider(0.0, 1.0, value=0.15, label="Smooth Duration")
        split_dur = gr.Slider(0, 10, step=1, value=3, label="Split Ref Audio Duration")

    submit_button = gr.Button("Synthesize")
    synthesized_audio = gr.Audio(label="Synthesized Audio", type="filepath")

    submit_button.click(
        fn=process_inputs,
        inputs=[
            text_prompt, reference_audios, n_merge, randomness,
            smooth_dur, denoise, t_denoise, split_dur,
        ],
        outputs=synthesized_audio,
    )

if __name__ == "__main__":
    demo.launch()