"""Streamlit voice assistant: record speech, transcribe it, and read it back.

The app records a short clip from the default microphone, drops silent
frames, transcribes the remainder with a Wav2Vec2 CTC model and finally
synthesizes the transcription with gTTS so it can be played in the browser.
"""

import streamlit as st
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from gtts import gTTS
import numpy as np
import sounddevice as sd

# Spanish Wav2Vec2 CTC checkpoint.  The previous value pointed at a Mixtral
# text-generation checkpoint, which is not a Wav2Vec2 model and cannot be
# loaded by Wav2Vec2Processor / Wav2Vec2ForCTC.
DEFAULT_MODEL_NAME = "facebook/wav2vec2-large-xlsr-53-spanish"

# Clips shorter than this are not sent to the model.
# NOTE(review): 400 samples (25 ms at 16 kHz) is assumed to be the receptive
# field of the Wav2Vec2 convolutional feature encoder -- confirm for the
# checkpoint in use.
_MIN_TRANSCRIBE_SAMPLES = 400


class VoiceRecognition:
    """Record audio, filter silence, transcribe speech and synthesize replies."""

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """Load the processor and CTC model.

        Args:
            model_name: Hugging Face model id (or local path) of a Wav2Vec2
                CTC checkpoint.
        """
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        self.sample_rate = 16000

    def listen(self, duration: float = 5) -> np.ndarray:
        """Record mono audio from the default input device.

        Args:
            duration: Recording length in seconds.

        Returns:
            One-dimensional float32 array with the recorded samples.
        """
        st.write("Escuchando...")
        audio_data = sd.rec(
            int(self.sample_rate * duration),
            samplerate=self.sample_rate,
            channels=1,
            dtype='float32',
        )
        sd.wait()  # Block until the recording buffer has been filled.
        st.write("Grabación terminada.")
        return audio_data.flatten()

    def vad(self, audio, threshold: float = 0.02, frame_ms: float = 20) -> np.ndarray:
        """Drop silent frames from ``audio`` (simple peak-amplitude VAD).

        The signal is split into consecutive frames of ``frame_ms``
        milliseconds and a frame is kept whole when any of its samples
        exceeds ``threshold`` in absolute value.  Gating whole frames rather
        than individual samples keeps the waveform inside speech segments
        intact; gating sample by sample would remove every low-amplitude
        sample (e.g. around zero crossings) and distort the speech.

        Args:
            audio: One-dimensional array of samples.
            threshold: Absolute amplitude above which a frame counts as active.
            frame_ms: Frame length in milliseconds.

        Returns:
            The concatenation of all active frames; empty when no sample
            exceeds ``threshold``.
        """
        audio = np.asarray(audio)
        if audio.size == 0:
            return audio

        frame_len = max(1, int(self.sample_rate * frame_ms / 1000))
        frame_starts = np.arange(0, audio.size, frame_len)
        # Peak per frame; reduceat copes with a shorter final frame.
        frame_peaks = np.maximum.reduceat(np.abs(audio), frame_starts)
        # Expand the per-frame decision back to a per-sample mask.
        keep = np.repeat(frame_peaks > threshold, frame_len)[:audio.size]
        return audio[keep]

    def transcribe(self, audio) -> str:
        """Transcribe ``audio`` with greedy CTC decoding.

        Args:
            audio: One-dimensional array of samples at ``self.sample_rate``.

        Returns:
            The decoded text, or an empty string when the clip is too short
            to be processed.
        """
        audio = np.asarray(audio, dtype=np.float32)
        if audio.size < _MIN_TRANSCRIBE_SAMPLES:
            return ""

        input_values = self.processor(
            audio, return_tensors="pt", sampling_rate=self.sample_rate
        ).input_values
        with torch.no_grad():  # Inference only; no gradients needed.
            logits = self.model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        return self.processor.decode(predicted_ids[0])

    def text_to_speech(self, text: str, output_path: str = "response.mp3") -> str:
        """Synthesize ``text`` in Spanish with gTTS and save it as an MP3.

        Args:
            text: Text to speak; the caller must make sure it is not empty.
            output_path: Where the MP3 file is written.

        Returns:
            ``output_path``.
        """
        tts = gTTS(text=text, lang='es')
        tts.save(output_path)
        return output_path


@st.cache_resource
def _load_recognizer() -> VoiceRecognition:
    """Build the recognizer once and reuse it across Streamlit reruns."""
    return VoiceRecognition()


def main():
    """Render the Streamlit page and run one record/transcribe/speak cycle."""
    st.title("Asistente de Voz - Reconocimiento de Voz")

    # Streamlit re-executes the script on every interaction; caching avoids
    # reloading the model each time the button is pressed.
    recognizer = _load_recognizer()

    if not st.button("Iniciar Grabación"):
        return

    audio = recognizer.listen()
    audio_vad = recognizer.vad(audio)
    if audio_vad.size == 0:
        st.write("No se detectó actividad de voz.")
        return

    transcription = recognizer.transcribe(audio_vad)
    st.write(f"Texto transcrito: {transcription}")

    # gTTS rejects empty text, so only synthesize a non-blank transcription.
    if not transcription.strip():
        st.write("No se pudo transcribir el audio.")
        return

    audio_path = recognizer.text_to_speech(transcription)
    st.audio(audio_path)


if __name__ == "__main__":
    main()