# Source: app.py uploaded by seayala, commit 3582a8e (verified), "Create app.py"
import torch
import gradio as gr
import soundfile as sf
import tempfile
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# Load the pretrained models and supporting resources.
# The processor and the text-to-speech model come from the same checkpoint;
# the vocoder is loaded from its own checkpoint and is later passed to
# ``model.generate_speech``.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the example speaker embeddings (the app could also let the user pick
# a speaker instead of using a fixed one).
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
# A single fixed entry (index 7306) is used as the voice for every request.
# A leading dimension of size 1 is added in front of the x-vector
# (presumably the batch axis expected by the model -- confirm).
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    0,
)
# Main entry point: generate speech from text.
def tts(text):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Args:
        text: The text to speak. ``None``, empty and whitespace-only values
            are treated as "nothing to synthesize".

    Returns:
        The path of a newly created temporary ``.wav`` file written at
        16000 Hz, or ``None`` when there is no text to synthesize. The file
        is created with ``delete=False`` and is not removed by this function.
    """
    # Guard against empty input instead of asking the model to speak nothing.
    if not text or not text.strip():
        return None

    inputs = processor(text=text, return_tensors="pt")
    # Clip over-long inputs to the model's configured text-position limit so
    # a very long prompt is shortened rather than failing inside the model.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        speech = model.generate_speech(
            input_ids, speaker_embeddings, vocoder=vocoder
        )

    # Reserve a temporary path and close the handle BEFORE writing to it by
    # name; reopening a still-open NamedTemporaryFile fails on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    # ``.cpu()`` keeps the conversion valid even if the model runs on a GPU.
    sf.write(wav_path, speech.cpu().numpy(), samplerate=16000)
    return wav_path
# Gradio user interface: a two-line text box as input and an audio player
# fed by the file path that ``tts`` returns.
interface = gr.Interface(
    tts,
    gr.Textbox(lines=2, placeholder="Introduce un texto..."),
    gr.Audio(type="filepath"),
    title="SpeechT5 TTS - Hugging Face Space",
    description="Convierte texto a voz con el modelo SpeechT5 de Microsoft",
)

# Start the web app when this module is executed.
interface.launch()