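"""Gradio demo for StyleTTS2-lite-vi.

Clones the model repo from Hugging Face on first run, loads the
StyleTTS2 checkpoint, and exposes a simple text-to-speech UI with
multi-reference-audio style control.
"""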
import os
import subprocess
import sys

import gradio as gr
import numpy as np
import soundfile as sf
import torch
# Clone the model repo from Hugging Face on first run.
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
repo_dir = "StyleTTS2-lite-vi"
if not os.path.exists(repo_dir):
    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)

# Make the cloned repo importable and load the model.
sys.path.append(os.path.abspath(repo_dir))
from inference import StyleTTS2

device = "cuda" if torch.cuda.is_available() else "cpu"
config_path = os.path.join(repo_dir, "Models", "config.yml")
models_path = os.path.join(repo_dir, "Models", "model_vi_en.pth")
model = StyleTTS2(config_path, models_path).to(device)
# Core inference function
def process_inputs(text_prompt, reference_audio_paths,
                   n_merge, randomness, smooth_dur,
                   denoise, t_denoise, split_dur):
    # Register each uploaded reference audio as a speaker: id_1, id_2, ...
    speakers = {}
    for i, path in enumerate(reference_audio_paths, 1):
        speakers[f"id_{i}"] = {
            "path": path,
            "lang": "vi",
            "speed": 1.1,
        }

    # Synthesize audio; "[id_1]" is the default speaker tag.
    r = model.generate(
        text_prompt, speakers, denoise, t_denoise,
        split_dur, "[id_1]", n_merge, randomness, smooth_dur
    )

    # Peak-normalize, guarding against all-silent output.
    peak = np.abs(r).max()
    if peak > 0:
        r = r / peak
    sf.write("output.wav", r, samplerate=24000)
    return "output.wav"
custom_css = """
#custom-box {
    min-height: 300px !important;
    display: flex;
    align-items: center;
}
#custom-box textarea {
    min-height: 250px !important;
    height: 100% !important;
}
"""
# Gradio UI
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("## StyleTTS2-lite-vi Demo")
    gr.Markdown("Upload one or more reference audios and enter your text to synthesize speech with style control.")

    with gr.Row():
        text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", elem_id="custom-box")
        reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3", ".flac"], file_count="multiple", elem_id="custom-box")

    # Parameters
    with gr.Accordion("Advanced Settings", open=False):
        denoise = gr.Checkbox(label="Apply Denoising", value=True)
        t_denoise = gr.Slider(0.0, 1.0, value=0.3, label="Denoise Strength")
        n_merge = gr.Slider(1, 30, value=16, label="Min Words to Merge")
        randomness = gr.Slider(0.0, 1.0, value=0.2, label="Randomness")
        smooth_dur = gr.Slider(0.0, 1.0, value=0.15, label="Smooth Duration")
        split_dur = gr.Slider(0, 10, step=1, value=3, label="Split Ref Audio Duration")

    submit_button = gr.Button("Synthesize")
    synthesized_audio = gr.Audio(label="Synthesized Audio", type="filepath")

    # Wire the button to inference; input order must match process_inputs.
    submit_button.click(
        fn=process_inputs,
        inputs=[
            text_prompt,
            reference_audios,
            n_merge,
            randomness,
            smooth_dur,
            denoise,
            t_denoise,
            split_dur,
        ],
        outputs=synthesized_audio,
    )
demo.launch()
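# Note: on a Hugging Face Space the launch above is picked up automatically;
# when running locally, demo.launch(share=True) can also serve a temporary
# public link.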