VoiceBot

Sleeping

App Files Community

Chris4K committed on Feb 4

Commit

ad50c97

verified ·

1 Parent(s): 72827ce

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -41

app.py CHANGED Viewed

@@ -75,10 +75,33 @@ async def process_audio_stream(websocket: WebSocket) -> AsyncGenerator[str, None
     while True:
         try:
-            audio_data = await websocket.receive_bytes()
-            # Convert audio data to the right format for VAD
-            is_speech = vad.is_speech(audio_data, SAMPLE_RATE)
             if is_speech:
                 silence_frames = 0
@@ -88,46 +111,52 @@ async def process_audio_stream(websocket: WebSocket) -> AsyncGenerator[str, None
                 silence_frames += 1
                 if silence_frames > 30:  # End of utterance detection
                     # Process complete utterance
-                    audio_bytes = b''.join(buffer)
-                    # Convert to wave file for speech recognition
-                    wav_buffer = io.BytesIO()
-                    with wave.open(wav_buffer, 'wb') as wav_file:
-                        wav_file.setnchannels(CHANNELS)
-                        wav_file.setsampwidth(2)  # 16-bit audio
-                        wav_file.setframerate(SAMPLE_RATE)
-                        wav_file.writeframes(audio_bytes)
-                    # Reset state
-                    buffer = []
-                    is_speaking = False
-                    silence_frames = 0
-                    # Check for wake word
-                    if await detect_wakeword(audio_bytes):
-                        # Process the audio and get response
-                        user_speech_text = stt(wav_buffer, desired_language)
-                        if "computer" in user_speech_text.lower():
-                            translated_text = to_en_translation(user_speech_text, desired_language)
-                            response = await agent.arun(translated_text)  # Assuming agent.run is made async
-                            bot_response_de = from_en_translation(response, desired_language)
-                            # Stream the response
-                            yield json.dumps({
-                                "user_text": user_speech_text,
-                                "response_de": bot_response_de,
-                                "response_en": response
-                            })
-                            # Generate and stream audio response
-                            bot_voice = tts(bot_response_de, desired_language)
-                            bot_voice_bytes = tts_to_bytesio(bot_voice)
-                            yield json.dumps({
-                                "audio": bot_voice_bytes.decode('latin1')
-                            })
         except Exception as e:
-            print(f"Error processing audio: {e}")
             break
 @app.get("/", response_class=HTMLResponse)

     while True:
         try:
+            # Add a timeout to prevent indefinite waiting
+            try:
+                audio_data = await asyncio.wait_for(websocket.receive_bytes(), timeout=5.0)
+            except asyncio.TimeoutError:
+                print("WebSocket receive timeout")
+                continue
+            except Exception as receive_error:
+                print(f"Error receiving audio data: {receive_error}")
+                # Break the loop if there's a persistent receive error
+                break
+            # Validate audio data
+            if not audio_data or len(audio_data) == 0:
+                print("Received empty audio data")
+                continue
+            # Ensure audio data meets minimum size for VAD processing
+            if len(audio_data) < CHUNK_SIZE:
+                print(f"Audio chunk too small: {len(audio_data)} bytes")
+                continue
+            try:
+                # Convert audio data to the right format for VAD
+                is_speech = vad.is_speech(audio_data, SAMPLE_RATE)
+            except Exception as vad_error:
+                print(f"VAD processing error: {vad_error}")
+                continue
             if is_speech:
                 silence_frames = 0
                 silence_frames += 1
                 if silence_frames > 30:  # End of utterance detection
                     # Process complete utterance
+                    try:
+                        audio_bytes = b''.join(buffer)
+                        # Convert to wave file for speech recognition
+                        wav_buffer = io.BytesIO()
+                        with wave.open(wav_buffer, 'wb') as wav_file:
+                            wav_file.setnchannels(CHANNELS)
+                            wav_file.setsampwidth(2)  # 16-bit audio
+                            wav_file.setframerate(SAMPLE_RATE)
+                            wav_file.writeframes(audio_bytes)
+                        # Reset state
+                        buffer = []
+                        is_speaking = False
+                        silence_frames = 0
+                        # Check for wake word
+                        if await detect_wakeword(audio_bytes):
+                            # Process the audio and get response
+                            user_speech_text = stt(wav_buffer, desired_language)
+                            if "computer" in user_speech_text.lower():
+                                translated_text = to_en_translation(user_speech_text, desired_language)
+                                response = await agent.arun(translated_text)  # Assuming agent.run is made async
+                                bot_response_de = from_en_translation(response, desired_language)
+                                # Stream the response
+                                yield json.dumps({
+                                    "user_text": user_speech_text,
+                                    "response_de": bot_response_de,
+                                    "response_en": response
+                                })
+                                # Generate and stream audio response
+                                bot_voice = tts(bot_response_de, desired_language)
+                                bot_voice_bytes = tts_to_bytesio(bot_voice)
+                                yield json.dumps({
+                                    "audio": bot_voice_bytes.decode('latin1')
+                                })
+                    except Exception as processing_error:
+                        print(f"Error processing speech utterance: {processing_error}")
         except Exception as e:
+            print(f"Unexpected error in audio stream processing: {e}")
+            # Add a small delay to prevent rapid reconnection attempts
+            await asyncio.sleep(1)
             break
 @app.get("/", response_class=HTMLResponse)