Chris4K committed on
Commit
ad50c97
·
verified ·
1 Parent(s): 72827ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -41
app.py CHANGED
@@ -75,10 +75,33 @@ async def process_audio_stream(websocket: WebSocket) -> AsyncGenerator[str, None
75
 
76
  while True:
77
  try:
78
- audio_data = await websocket.receive_bytes()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Convert audio data to the right format for VAD
81
- is_speech = vad.is_speech(audio_data, SAMPLE_RATE)
 
 
 
 
 
 
 
 
 
82
 
83
  if is_speech:
84
  silence_frames = 0
@@ -88,46 +111,52 @@ async def process_audio_stream(websocket: WebSocket) -> AsyncGenerator[str, None
88
  silence_frames += 1
89
  if silence_frames > 30: # End of utterance detection
90
  # Process complete utterance
91
- audio_bytes = b''.join(buffer)
92
-
93
- # Convert to wave file for speech recognition
94
- wav_buffer = io.BytesIO()
95
- with wave.open(wav_buffer, 'wb') as wav_file:
96
- wav_file.setnchannels(CHANNELS)
97
- wav_file.setsampwidth(2) # 16-bit audio
98
- wav_file.setframerate(SAMPLE_RATE)
99
- wav_file.writeframes(audio_bytes)
100
-
101
- # Reset state
102
- buffer = []
103
- is_speaking = False
104
- silence_frames = 0
105
-
106
- # Check for wake word
107
- if await detect_wakeword(audio_bytes):
108
- # Process the audio and get response
109
- user_speech_text = stt(wav_buffer, desired_language)
110
- if "computer" in user_speech_text.lower():
111
- translated_text = to_en_translation(user_speech_text, desired_language)
112
- response = await agent.arun(translated_text) # Assuming agent.run is made async
113
- bot_response_de = from_en_translation(response, desired_language)
114
-
115
- # Stream the response
116
- yield json.dumps({
117
- "user_text": user_speech_text,
118
- "response_de": bot_response_de,
119
- "response_en": response
120
- })
121
-
122
- # Generate and stream audio response
123
- bot_voice = tts(bot_response_de, desired_language)
124
- bot_voice_bytes = tts_to_bytesio(bot_voice)
125
- yield json.dumps({
126
- "audio": bot_voice_bytes.decode('latin1')
127
- })
 
128
 
 
 
 
129
  except Exception as e:
130
- print(f"Error processing audio: {e}")
 
 
131
  break
132
 
133
  @app.get("/", response_class=HTMLResponse)
 
75
 
76
  while True:
77
  try:
78
+ # Add a timeout to prevent indefinite waiting
79
+ try:
80
+ audio_data = await asyncio.wait_for(websocket.receive_bytes(), timeout=5.0)
81
+ except asyncio.TimeoutError:
82
+ print("WebSocket receive timeout")
83
+ continue
84
+ except Exception as receive_error:
85
+ print(f"Error receiving audio data: {receive_error}")
86
+ # Break the loop if there's a persistent receive error
87
+ break
88
+
89
+ # Validate audio data
90
+ if not audio_data or len(audio_data) == 0:
91
+ print("Received empty audio data")
92
+ continue
93
 
94
+ # Ensure audio data meets minimum size for VAD processing
95
+ if len(audio_data) < CHUNK_SIZE:
96
+ print(f"Audio chunk too small: {len(audio_data)} bytes")
97
+ continue
98
+
99
+ try:
100
+ # Convert audio data to the right format for VAD
101
+ is_speech = vad.is_speech(audio_data, SAMPLE_RATE)
102
+ except Exception as vad_error:
103
+ print(f"VAD processing error: {vad_error}")
104
+ continue
105
 
106
  if is_speech:
107
  silence_frames = 0
 
111
  silence_frames += 1
112
  if silence_frames > 30: # End of utterance detection
113
  # Process complete utterance
114
+ try:
115
+ audio_bytes = b''.join(buffer)
116
+
117
+ # Convert to wave file for speech recognition
118
+ wav_buffer = io.BytesIO()
119
+ with wave.open(wav_buffer, 'wb') as wav_file:
120
+ wav_file.setnchannels(CHANNELS)
121
+ wav_file.setsampwidth(2) # 16-bit audio
122
+ wav_file.setframerate(SAMPLE_RATE)
123
+ wav_file.writeframes(audio_bytes)
124
+
125
+ # Reset state
126
+ buffer = []
127
+ is_speaking = False
128
+ silence_frames = 0
129
+
130
+ # Check for wake word
131
+ if await detect_wakeword(audio_bytes):
132
+ # Process the audio and get response
133
+ user_speech_text = stt(wav_buffer, desired_language)
134
+ if "computer" in user_speech_text.lower():
135
+ translated_text = to_en_translation(user_speech_text, desired_language)
136
+ response = await agent.arun(translated_text) # Assuming agent.run is made async
137
+ bot_response_de = from_en_translation(response, desired_language)
138
+
139
+ # Stream the response
140
+ yield json.dumps({
141
+ "user_text": user_speech_text,
142
+ "response_de": bot_response_de,
143
+ "response_en": response
144
+ })
145
+
146
+ # Generate and stream audio response
147
+ bot_voice = tts(bot_response_de, desired_language)
148
+ bot_voice_bytes = tts_to_bytesio(bot_voice)
149
+ yield json.dumps({
150
+ "audio": bot_voice_bytes.decode('latin1')
151
+ })
152
 
153
+ except Exception as processing_error:
154
+ print(f"Error processing speech utterance: {processing_error}")
155
+
156
  except Exception as e:
157
+ print(f"Unexpected error in audio stream processing: {e}")
158
+ # Add a small delay to prevent rapid reconnection attempts
159
+ await asyncio.sleep(1)
160
  break
161
 
162
  @app.get("/", response_class=HTMLResponse)