"""Gradio demo that converts text to speech with Microsoft's SpeechT5 model."""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Sample rate (Hz) used when writing the generated waveform to disk.
SAMPLE_RATE = 16000

# Row of the x-vector dataset used as the (single, fixed) speaker voice.
SPEAKER_INDEX = 7306

# Load models and resources once at start-up so every request reuses them.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load example speaker embeddings (the user could also be allowed to pick a
# speaker). `unsqueeze(0)` adds a leading dimension of size 1 to the x-vector.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(
    embeddings_dataset[SPEAKER_INDEX]["xvector"]
).unsqueeze(0)


def tts(text: str) -> str:
    """Synthesize speech for *text* and return the path of the WAV file.

    Args:
        text: The text to convert to speech.

    Returns:
        Path to a temporary ``.wav`` file holding the generated audio. The
        file is created with ``delete=False``, so this function never removes
        it.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    # Reject blank input up front instead of running the model on nothing.
    if not text or not text.strip():
        raise gr.Error("Introduce un texto para convertir a voz.")

    inputs = processor(text=text, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    # Reserve a unique temporary path, then close the handle before writing so
    # only one handle to the file is open at a time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wav_path = f.name
    sf.write(wav_path, speech.numpy(), samplerate=SAMPLE_RATE)
    return wav_path


# Gradio interface
interface = gr.Interface(
    fn=tts,
    inputs=gr.Textbox(lines=2, placeholder="Introduce un texto..."),
    outputs=gr.Audio(type="filepath"),
    title="SpeechT5 TTS - Hugging Face Space",
    description="Convierte texto a voz con el modelo SpeechT5 de Microsoft",
)

# Launched at module level, as in the original script.
interface.launch()