"""Gradio demo for Greek text-to-speech with a male and a female VITS voice.

Both checkpoints and their processors are loaded once at import time; the UI
lets the user pick a speaker, listen to a reference clip and synthesise
speech from Greek text.
"""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import AutoProcessor, VitsModel

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Speaker label -> Hugging Face repository id of the VITS checkpoint.
MODELS = {
    "Male": "vlntlbr/vits-greek-male",
    "Female": "vlntlbr/vits-greek-female",
}

# One model and one processor per speaker, loaded eagerly so requests do not
# pay the loading cost.
tts = {k: VitsModel.from_pretrained(r).to(DEVICE) for k, r in MODELS.items()}
proc = {k: AutoProcessor.from_pretrained(r) for k, r in MODELS.items()}

# Speaker label -> path of the reference clip shown in the UI.
SNIPPETS = {
    "Male": "audio/male_ref.mp3",
    "Female": "audio/female_ref.mp3",
}


def synth(text: str, speaker: str) -> str:
    """Synthesise *text* with the selected speaker and return an audio path.

    Args:
        text: Text to synthesise.
        speaker: Key into ``MODELS`` / ``tts`` / ``proc`` ("Male" or
            "Female").

    Returns:
        Path of a temporary ``.mp3`` file holding the generated waveform.
        The file is created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some Greek text to synthesise.")

    model = tts[speaker]
    inputs = proc[speaker](text, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        wav = model(**inputs).waveform.squeeze().cpu()

    # Only reserve a unique path here and close the handle right away:
    # soundfile reopens the file by name, which fails on Windows while the
    # handle is still open, and an unclosed handle leaks a file descriptor
    # on every request.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        out_path = tmp.name

    # Take the sample rate from the loaded checkpoint instead of assuming
    # 16 kHz, so a different model does not play back at the wrong speed.
    # NOTE(review): MP3 output relies on the installed libsndfile supporting
    # that format - confirm for the deployment environment.
    sf.write(out_path, wav.numpy(), model.config.sampling_rate)
    return out_path


# Example Greek phrases, each paired with the speaker to use.
examples = [
    ["Πες μου πώς σε λένε.", "Male"],
    ["Τι μας προτείνεις;", "Female"],
    ["Είναι τόσο όμορφα έξω!", "Male"],
    ["Η οικογένεια είναι μαζεμένη στην τραπεζαρία", "Female"],
]

custom_css = """
h1, .gr-markdown h1 { text-align: center; }
.gr-markdown p { text-align: center; }
"""

with gr.Blocks(title="Greek TTS (male / female)", css=custom_css) as demo:
    gr.Markdown(
        "# Greek TTS Demo\n"
        "Choose a speaker, listen to a reference clip, then enter Greek text."
    )

    with gr.Row():
        speaker = gr.Radio(["Male", "Female"], value="Male", label="Speaker")
        ref_aud = gr.Audio(SNIPPETS["Male"], interactive=False, label="Reference")

    # Swap the reference clip whenever a different speaker is selected.
    speaker.change(lambda s: gr.update(value=SNIPPETS[s]), speaker, ref_aud)

    text_in = gr.Textbox(label="Greek text", placeholder="Γράψε κάτι…")
    gr.Examples(examples=examples, inputs=[text_in, speaker])
    out_aud = gr.Audio(label="Synthesised speech")

    gr.Button("Generate!").click(synth, [text_in, speaker], out_aud)

demo.launch()