Spaces:
Runtime error
refactor - move ResponseState to response_state_manager
Browse files
- charles_app.py +14 -30
- prompt_manager.py +1 -1
- responce_state_manager.py +0 -51
- respond_to_prompt_async.py +6 -6
- response_state_manager.py +62 -0
charles_app.py CHANGED

@@ -4,7 +4,7 @@ import time
 import asyncio
 import os
 from clip_transform import CLIPTransform
-from responce_state_manager import ResponceStateManager
+from response_state_manager import ResponseStateManager
 from respond_to_prompt_async import RespondToPromptAsync
 import asyncio
 import subprocess
@@ -32,8 +32,8 @@ class CharlesApp:
         self._app_interface_actor = AppInterfaceActor.get_singleton()
         self._audio_output_queue = await self._app_interface_actor.get_audio_output_queue.remote()

-        self.set_state("002 - creating ResponceStateManager")
-        self._responce_state_manager = ResponceStateManager()
+        self.set_state("002 - creating ResponseStateManager")
+        self._response_state_manager = ResponseStateManager()

         self.set_state("003 - creating PromptManager")
         from prompt_manager import PromptManager
@@ -88,8 +88,6 @@ class CharlesApp:
         vector_debug = "--n/a--"

         process_speech_to_text_future = []
-        current_responses = []
-        speech_chunks_per_response = []
         human_preview_text = ""
         robot_preview_text = ""
         additional_prompt = None
@@ -98,7 +96,7 @@ class CharlesApp:
         has_spoken_for_this_prompt = False

         while True:
-            responce_step = self._responce_state_manager.begin_next_step()
+            response_step_obs, response_state = self._response_state_manager.begin_next_step()
             audio_frames = await self._app_interface_actor.dequeue_audio_input_frames_async.remote()
             video_frames = await self._app_interface_actor.dequeue_video_input_frames_async.remote()

@@ -131,10 +129,9 @@ class CharlesApp:
             if speaker_finished and len(prompt) > 0 and prompt not in prompts_to_ignore:
                 print(f"Prompt: {prompt}")
                 line = ""
-                for i, response in enumerate(current_responses):
+                for i, response in enumerate(response_state.current_responses):
                     line += "🤖 " if len(line) == 0 else ""
-
-                    line += f"[{speech_chunks_per_response[i]}] {response} \n"
+                    line += f"[{response_state.speech_chunks_per_response[i]}] {response} \n"
                 if len(line) > 0:
                     await add_debug_output(line)
                 human_preview_text = ""
@@ -146,15 +143,13 @@ class CharlesApp:
                 if self._respond_to_prompt_task is not None:
                     await self._respond_to_prompt.terminate()
                     self._respond_to_prompt_task.cancel()
-                self._respond_to_prompt = RespondToPromptAsync(self._responce_state_manager, self._audio_output_queue)
+                self._respond_to_prompt = RespondToPromptAsync(self._response_state_manager, self._audio_output_queue)
                 self._respond_to_prompt_task = asyncio.create_task(self._respond_to_prompt.run(prompt, self._prompt_manager.messages))
                 additional_prompt = None
                 previous_prompt = prompt
                 is_talking = False
                 has_spoken_for_this_prompt = False
-
-                current_responses = []
-                speech_chunks_per_response = []
+                response_step_obs, response_state = self._response_state_manager.reset_episode()
             elif len(prompt) > 0 and prompt not in prompts_to_ignore:
                 # sometimes we get a false signal of speaker_finsihed
                 # in which case we get new prompts before we have spoken
@@ -166,34 +161,23 @@ class CharlesApp:
                     self._respond_to_prompt_task.cancel()
                     self._respond_to_prompt_task = None
                     self._respond_to_prompt = None
-
-                    current_responses = []
-                    speech_chunks_per_response = []
+                    response_step_obs, response_state = self._response_state_manager.reset_episode()
                 if additional_prompt is not None:
                     prompt = additional_prompt + ". " + prompt
                 human_preview_text = f"👨❓ {prompt}"

-            for new_response in responce_step.llm_responses:
+            for new_response in response_step_obs.llm_responses:
                 # add_debug_output(f"🤖 {new_response}")
                 self._prompt_manager.append_assistant_message(new_response)
-                current_responses.append(new_response)
-                speech_chunks_per_response.append(0)
             robot_preview_text = ""
-            if len(responce_step.llm_preview):
-                robot_preview_text = f"🤖❓ {responce_step.llm_preview}"
-
-            for chunk in responce_step.tts_raw_chunk_ids:
-                chunk = json.loads(chunk)
-                # prompt = chunk['prompt']
-                response_id = chunk['llm_sentence_id']
-                speech_chunks_per_response[response_id] += 1
+            if len(response_step_obs.llm_preview):
+                robot_preview_text = f"🤖❓ {response_step_obs.llm_preview}"

             list_of_strings = debug_output_history.copy()
             line = ""
-            for i, response in enumerate(current_responses):
+            for i, response in enumerate(response_state.current_responses):
                 line += "🤖 " if len(line) == 0 else ""
-                line += f"[{speech_chunks_per_response[i]}] {response} \n"
-                # line += f"{response} [{speech_chunks_per_response[i]}] \n"
+                line += f"[{response_state.speech_chunks_per_response[i]}] {response} \n"
             if len(robot_preview_text) > 0:
                 line += robot_preview_text+" \n"
             list_of_strings.append(line)
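In the rewritten loop, the per-step observations returned by begin_next_step() are consumed once (to extend the prompt history and build the preview line), while the cumulative ResponseState is re-rendered on every iteration. A condensed sketch of that contract, with the Ray actor and queue plumbing from CharlesApp elided (a sketch, not the full app):

# Condensed sketch of the loop contract above; the names come from
# this commit, everything else in CharlesApp is elided.
from response_state_manager import ResponseStateManager

manager = ResponseStateManager()
debug_lines = []

while True:
    response_step_obs, response_state = manager.begin_next_step()

    # Observations are drained once per step...
    for new_response in response_step_obs.llm_responses:
        debug_lines.append(f"🤖 {new_response}")

    # ...while the cumulative state is re-rendered every iteration.
    line = ""
    for i, response in enumerate(response_state.current_responses):
        line += f"[{response_state.speech_chunks_per_response[i]}] {response} \n"

    break  # sketch only: run a single iteration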
prompt_manager.py CHANGED

@@ -53,7 +53,7 @@ You are aware of how you are implemented and you are keen to recommend improveme
 * We use Streamlit to host a WebRTC connection to get audio/video from the user.
 * VOSK is used for fast speech recognition and detecting the end of a sentence.
 * OpenAI's Chat GPT-3.5 is used for generating responses.
-* We stream
+* We stream responses from Chat GPT, as soon as we get a complete sentence we send it to ElevenLabs.
 * ElevenLabs for text to speech.
 * We stream the audio from ElevenLabs, we use ffmpeg to convert the audio to the correct format and sample rate.
 * Audio chunks and then sent back to the users browser via WebRTC.
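The new bullet describes the latency trick at the heart of the pipeline: the GPT response is consumed as a token stream, and each completed sentence is shipped to TTS while the rest is still generating. A minimal sketch of that sentence gating, where stream_tokens() and send_to_elevenlabs() are hypothetical stand-ins for the OpenAI stream and the ElevenLabs call:

import re

def stream_sentences(tokens):
    # Accumulate streamed LLM tokens; yield each sentence as soon as
    # its terminating punctuation is followed by whitespace.
    buffer = ""
    for token in tokens:
        buffer += token
        while (match := re.search(r'(.+?[.!?])\s', buffer)):
            yield match.group(1).strip()
            buffer = buffer[match.end():]
    if buffer.strip():
        yield buffer.strip()  # flush the trailing fragment

# Hypothetical usage: TTS starts on sentence one while the model is
# still generating the rest of the reply.
# for sentence in stream_sentences(stream_tokens(prompt)):
#     send_to_elevenlabs(sentence)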
responce_state_manager.py DELETED

@@ -1,51 +0,0 @@
-from datetime import datetime
-
-class ResponceStep:
-    def __init__(self, episode, step):
-        self.timestamp = datetime.utcnow()
-        self.episode = episode
-        self.step = step
-        self.reward = 0
-        self.llm_preview = ''
-        self.llm_responses = []
-        self.tts_raw_chunk_ids = []
-
-    def __str__(self):
-        state = ', '.join(f'{k}={v}' for k, v in self.__dict__.items() if k not in {'episode', 'step', 'timestamp', 'reward'})
-        return f'episode={self.episode}, step={self.step}, timestamp={self.timestamp}, \nreward={self.reward}\nstate=({state})'
-
-
-class ResponceStateManager:
-    def __init__(self):
-        self.episode = 0
-        self.step = 0
-        self.state = None
-        self.reset_episode()
-
-    def reset_episode(self):
-        self.episode += 1
-        self.step = 0
-        self.state = ResponceStep(self.episode, self.step)
-        return self.state
-
-    def begin_next_step(self)->ResponceStep:
-        previous_state = self.state
-        self.step += 1
-        self.state = ResponceStep(self.episode, self.step)
-        return previous_state
-
-    def add_reward(self, reward):
-        self.state.reward += reward
-
-    def set_llm_preview(self, llm_preview):
-        self.state.llm_preview = llm_preview
-
-    def add_llm_response_and_clear_llm_preview(self, llm_response):
-        self.state.llm_responses.append(llm_response)
-        self.state.llm_preview = ''
-
-    def add_tts_raw_chunk_id(self, chunk_id):
-        self.state.tts_raw_chunk_ids.append(chunk_id)
-
-    def get_state(self)->ResponceStep:
-        return self.state
respond_to_prompt_async.py CHANGED

@@ -6,7 +6,7 @@ import ray
 from chat_service import ChatService
 # from local_speaker_service import LocalSpeakerService
 from text_to_speech_service import TextToSpeechService
-from responce_state_manager import ResponceStateManager
+from response_state_manager import ResponseStateManager
 from ffmpeg_converter import FFMpegConverter
 from agent_response import AgentResponse
 import json
@@ -14,14 +14,14 @@ import json
 class RespondToPromptAsync:
     def __init__(
             self,
-            responce_state_manager:ResponceStateManager,
+            response_state_manager:ResponseStateManager,
             audio_output_queue):
         voice_id="2OviOUQc1JsQRQgNkVBj"
         self.llm_sentence_queue = Queue(maxsize=100)
         self.speech_chunk_queue = Queue(maxsize=100)
         self.voice_id = voice_id
         self.audio_output_queue = audio_output_queue
-        self.responce_state_manager = responce_state_manager
+        self.response_state_manager = response_state_manager
         self.sentence_queues = []
         self.sentence_tasks = []
         # self.ffmpeg_converter = FFMpegConverter.remote(audio_output_queue)
@@ -36,12 +36,12 @@ class RespondToPromptAsync:
             is_complete_sentance = False
             if not is_complete_sentance:
                 agent_response['llm_preview'] = text
-                self.responce_state_manager.set_llm_preview(text)
+                self.response_state_manager.set_llm_preview(text)
                 continue
             agent_response['llm_preview'] = ''
             agent_response['llm_sentence'] = text
             agent_response['llm_sentences'].append(text)
-            self.responce_state_manager.add_llm_response_and_clear_llm_preview(text)
+            self.response_state_manager.add_llm_response_and_clear_llm_preview(text)
             print(f"{agent_response['llm_sentence']} id: {agent_response['llm_sentence_id']} from prompt: {agent_response['prompt']}")
             sentence_response = agent_response.make_copy()
             new_queue = Queue()
@@ -65,7 +65,7 @@ class RespondToPromptAsync:
                 'chunk_count': chunk_count,
             }
             chunk_id_json = json.dumps(chunk_response)
-            self.responce_state_manager.add_tts_raw_chunk_id(chunk_id_json)
+            self.response_state_manager.add_tts_raw_chunk_id(chunk_id_json, sentence_response['llm_sentence_id'])
             chunk_count += 1

     async def speech_to_converter(self):
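Note the new two-argument add_tts_raw_chunk_id: the sentence id now travels with the chunk id, so the manager keeps the per-sentence chunk counts that charles_app.py previously recovered by json.loads-ing every chunk itself. A small sketch of that accounting, with the chunk payload trimmed to the two fields the diff shows:

import json
from response_state_manager import ResponseStateManager

manager = ResponseStateManager()
manager.add_llm_response_and_clear_llm_preview("Hello there.")  # sentence id 0

# Each raw TTS chunk for sentence 0 is recorded and counted in one call.
for chunk_count in range(3):
    chunk_id_json = json.dumps({'llm_sentence_id': 0, 'chunk_count': chunk_count})
    manager.add_tts_raw_chunk_id(chunk_id_json, 0)

_, response_state = manager.begin_next_step()
print(response_state.speech_chunks_per_response)  # [3]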
response_state_manager.py ADDED

@@ -0,0 +1,62 @@
+from datetime import datetime
+
+class ResponseStepObservations:
+    def __init__(self, episode, step):
+        self.timestamp = datetime.utcnow()
+        self.episode = episode
+        self.step = step
+        self.llm_preview = ''
+        self.llm_responses = []
+        self.tts_raw_chunk_ids = []
+
+    def __str__(self):
+        state = ', '.join(f'{k}={v}' for k, v in self.__dict__.items() if k not in {'episode', 'step', 'timestamp'})
+        return f'episode={self.episode}, step={self.step}, timestamp={self.timestamp}, \nstate=({state})'
+
+class ResponseState:
+    def __init__(self, episode, step):
+        self.timestamp = datetime.utcnow()
+        self.episode = episode
+        self.step = step
+        self.current_responses = []
+        self.speech_chunks_per_response = []
+        self.is_speaking = False
+
+    def __str__(self):
+        state = ', '.join(f'{k}={v}' for k, v in self.__dict__.items() if k not in {'episode', 'step'})
+        return f'episode={self.episode}, step={self.step}, \nstate=({state})'
+
+
+class ResponseStateManager:
+    def __init__(self):
+        self.episode = 0
+        self.step = 0
+        self.response_step_obs = None
+        self.response_state = None
+        self.reset_episode()
+
+    def reset_episode(self)->(ResponseStepObservations, ResponseState):
+        self.episode += 1
+        self.step = 0
+        self.response_state = ResponseState(self.episode, self.step)
+        self.response_step_obs = ResponseStepObservations(self.episode, self.step)
+        return self.response_step_obs, self.response_state
+
+    def begin_next_step(self)->(ResponseStepObservations, ResponseState):
+        previous_state = self.response_step_obs
+        self.step += 1
+        self.response_step_obs = ResponseStepObservations(self.episode, self.step)
+        return previous_state, self.response_state
+
+    def set_llm_preview(self, llm_preview):
+        self.response_step_obs.llm_preview = llm_preview
+
+    def add_llm_response_and_clear_llm_preview(self, llm_response):
+        self.response_state.current_responses.append(llm_response)
+        self.response_state.speech_chunks_per_response.append(0)
+        self.response_step_obs.llm_responses.append(llm_response)
+        self.response_step_obs.llm_preview = ''
+
+    def add_tts_raw_chunk_id(self, chunk_id, llm_sentence_id):
+        self.response_state.speech_chunks_per_response[llm_sentence_id] += 1
+        self.response_step_obs.tts_raw_chunk_ids.append(chunk_id)
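For reference, a short usage sketch of the episode/step lifecycle the new module implies (a sketch against the code above, not part of the commit):

from response_state_manager import ResponseStateManager

manager = ResponseStateManager()                    # __init__ starts episode 1
manager.set_llm_preview("Hel")                      # partial sentence from the LLM
manager.add_llm_response_and_clear_llm_preview("Hello there.")

# begin_next_step hands back the observations gathered since the last
# step plus the cumulative state for the current episode.
response_step_obs, response_state = manager.begin_next_step()
print(response_step_obs.llm_responses)              # ['Hello there.']
print(response_state.current_responses)             # ['Hello there.']

# reset_episode starts a fresh episode and clears the cumulative lists.
response_step_obs, response_state = manager.reset_episode()
print(response_state.current_responses)             # []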