Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
4 |
+
from gtts import gTTS
|
5 |
+
import numpy as np
|
6 |
+
import sounddevice as sd
|
7 |
+
|
8 |
+
class VoiceRecognition:
    """Record microphone audio, transcribe Spanish speech with Wav2Vec2,
    and synthesize a spoken response with gTTS.

    Pipeline: listen() -> vad() -> transcribe() -> text_to_speech().
    """

    # BUG FIX: the original loaded "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # which is a text-generation LLM, not a Wav2Vec2 checkpoint, so
    # from_pretrained failed at startup (the Space's "Runtime error").
    # This is a real Spanish Wav2Vec2 CTC model, matching the Spanish UI.
    MODEL_NAME = "facebook/wav2vec2-large-xlsr-53-spanish"

    def __init__(self, model_name: str = MODEL_NAME):
        """Load the ASR processor/model (downloads on first use).

        model_name: Hugging Face checkpoint id; defaults to a Spanish
        Wav2Vec2 model. Keeping it a parameter lets callers swap models
        without touching the class.
        """
        # Processor handles feature extraction on the way in and CTC
        # token decoding on the way out.
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        # Wav2Vec2 checkpoints are trained on 16 kHz mono audio.
        self.sample_rate = 16000

    def listen(self, seconds: float = 5):
        """Record `seconds` of mono float32 microphone audio.

        Returns a 1-D numpy array of samples at self.sample_rate.
        Blocks until the recording is complete.
        """
        st.write("Escuchando...")
        audio_data = sd.rec(
            int(self.sample_rate * seconds),
            samplerate=self.sample_rate,
            channels=1,
            dtype='float32',
        )
        sd.wait()  # block until sd.rec finishes filling the buffer
        # Fixed mojibake: original emitted "Grabaci贸n terminada."
        st.write("Grabación terminada.")
        # sd.rec returns shape (n, 1); flatten to the 1-D shape the
        # processor expects.
        return audio_data.flatten()

    def vad(self, audio, threshold: float = 0.02):
        """Crude energy-based voice activity detection.

        Keeps only samples whose absolute amplitude exceeds `threshold`
        (default preserves the original hard-coded 0.02).
        NOTE(review): this concatenates non-contiguous samples, which
        distorts the waveform; a frame-based VAD would be more faithful.
        """
        return audio[np.abs(audio) > threshold]

    def transcribe(self, audio):
        """Greedy CTC decode of `audio`; returns the transcript string."""
        input_values = self.processor(
            audio, return_tensors="pt", sampling_rate=self.sample_rate
        ).input_values
        with torch.no_grad():  # inference only — no gradients needed
            logits = self.model(input_values).logits
        # Greedy decoding: most likely token per frame, then CTC collapse.
        predicted_ids = torch.argmax(logits, dim=-1)
        return self.processor.decode(predicted_ids[0])

    def text_to_speech(self, text, output_path: str = "response.mp3"):
        """Synthesize Spanish speech for `text`; returns the saved MP3 path.

        output_path defaults to the original hard-coded "response.mp3"
        but is now configurable. Requires network access (gTTS calls
        Google's TTS endpoint).
        """
        tts = gTTS(text=text, lang='es')
        tts.save(output_path)
        return output_path
37 |
+
|
38 |
+
def main():
    """Streamlit entry point: record on demand, transcribe, and play back
    a synthesized spoken response.

    Fixed mojibake in the user-facing strings ("Grabaci贸n" -> "Grabación",
    "detect贸" -> "detectó").
    """
    st.title("Asistente de Voz - Reconocimiento de Voz")
    # NOTE(review): Streamlit re-runs the whole script on every
    # interaction, so the model is reloaded per click; wrapping the
    # constructor in st.cache_resource would avoid that — confirm desired.
    recognizer = VoiceRecognition()

    if st.button("Iniciar Grabación"):
        audio = recognizer.listen()
        audio_vad = recognizer.vad(audio)

        # Only transcribe if the crude VAD kept any samples.
        if audio_vad.size > 0:
            transcription = recognizer.transcribe(audio_vad)
            st.write(f"Texto transcrito: {transcription}")
            audio_path = recognizer.text_to_speech(transcription)
            st.audio(audio_path)
        else:
            st.write("No se detectó actividad de voz.")
53 |
+
|
54 |
+
# Script entry point: launch the Streamlit UI only when executed directly
# (not when imported as a module).
if __name__ == "__main__":
    main()