# Source: Hugging Face Space "xaman7", file app.py
# Author: salomonsky -- commit afd9000 ("Create app.py", verified), 1.91 kB
import streamlit as st
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from gtts import gTTS
import numpy as np
import sounddevice as sd
class VoiceRecognition:
    """Record microphone audio, transcribe it with Wav2Vec2 and speak text back.

    The pipeline is: ``listen`` -> ``vad`` -> ``transcribe`` -> ``text_to_speech``.
    """

    # Spanish CTC checkpoint. The checkpoint must be a Wav2Vec2 CTC model that
    # ships a matching processor; a text-only LLM checkpoint cannot be loaded
    # through Wav2Vec2Processor / Wav2Vec2ForCTC.
    DEFAULT_MODEL = "facebook/wav2vec2-large-xlsr-53-spanish"

    def __init__(self, model_name=DEFAULT_MODEL):
        """Load the processor and acoustic model.

        Args:
            model_name: Hugging Face identifier (or local path) of a Wav2Vec2
                CTC checkpoint. Defaults to a Spanish model, matching the
                Spanish text-to-speech output of ``text_to_speech``.
        """
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        # Rate used both for recording and as the rate declared to the processor.
        self.sample_rate = 16000

    def listen(self, duration=5):
        """Record mono audio from the default input device.

        Note that ``sounddevice`` records on the machine running this script,
        not in the visitor's browser.

        Args:
            duration: Recording length in seconds.

        Returns:
            One-dimensional float32 NumPy array of samples.
        """
        st.write("Escuchando...")
        frame_count = int(self.sample_rate * duration)
        audio_data = sd.rec(
            frame_count,
            samplerate=self.sample_rate,
            channels=1,
            dtype='float32',
        )
        sd.wait()  # Block until the recording buffer has been filled.
        st.write("Grabación terminada.")
        return audio_data.flatten()

    def vad(self, audio, threshold=0.02, frame_ms=30):
        """Drop silent stretches of ``audio`` using a peak-amplitude gate.

        The signal is cut into fixed-length frames and a frame is kept in full
        when any of its samples exceeds ``threshold`` in absolute value.
        Gating whole frames (rather than individual samples) keeps the
        waveform inside speech intact, including its zero crossings.

        Args:
            audio: Array-like of samples.
            threshold: Absolute amplitude above which a frame counts as active.
            frame_ms: Frame length in milliseconds.

        Returns:
            One-dimensional NumPy array with the samples of all active frames,
            in their original order; empty when no frame is active.
        """
        samples = np.asarray(audio).ravel()
        if samples.size == 0:
            return samples

        frame_len = max(1, int(self.sample_rate * frame_ms / 1000))
        # Zero-pad so the signal divides evenly into frames; padding is silent
        # and is trimmed from the mask again below.
        padding = (-samples.size) % frame_len
        frames = np.pad(samples, (0, padding)).reshape(-1, frame_len)
        active = np.abs(frames).max(axis=1) > threshold
        keep = np.repeat(active, frame_len)[:samples.size]
        return samples[keep]

    def transcribe(self, audio):
        """Return the greedy CTC transcription of ``audio``.

        Args:
            audio: One-dimensional array of samples at ``self.sample_rate``.

        Returns:
            The decoded text.
        """
        features = self.processor(
            audio, return_tensors="pt", sampling_rate=self.sample_rate
        )
        with torch.no_grad():  # Inference only; no gradients needed.
            logits = self.model(features.input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        return self.processor.decode(predicted_ids[0])

    def text_to_speech(self, text, output_path="response.mp3"):
        """Synthesize ``text`` as Spanish speech and save it as an MP3 file.

        Args:
            text: Text to speak.
            output_path: Destination file; overwritten if it already exists.

        Returns:
            ``output_path``.
        """
        tts = gTTS(text=text, lang='es')
        tts.save(output_path)
        return output_path
def main():
    """Run the Streamlit voice-assistant page.

    On button press: record audio, strip silence, transcribe it, show the
    text and play it back as synthesized speech.
    """
    st.title("Asistente de Voz - Reconocimiento de Voz")

    # Streamlit re-executes the script on every interaction; caching the
    # recognizer keeps the model from being reloaded on each button click.
    @st.cache_resource
    def _load_recognizer():
        return VoiceRecognition()

    recognizer = _load_recognizer()

    if not st.button("Iniciar Grabación"):
        return

    audio = recognizer.listen()
    audio_vad = recognizer.vad(audio)
    if audio_vad.size == 0:
        st.write("No se detectó actividad de voz.")
        return

    transcription = recognizer.transcribe(audio_vad)
    st.write(f"Texto transcrito: {transcription}")

    # gTTS rejects empty text, so skip synthesis when nothing was recognized.
    if not transcription.strip():
        st.write("No se pudo transcribir el audio.")
        return

    audio_path = recognizer.text_to_speech(transcription)
    st.audio(audio_path)
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()