Commit 6af5041
Parent(s): 168536f
add some comments and remove unnecessary comemnts
app.py
CHANGED
@@ -120,6 +120,7 @@ text_client = InferenceClient(
 
 
 ###### COQUI TTS FUNCTIONS ######
+
 def get_latents(speaker_wav):
     # create as function as we can populate here with voice cleanup/filtering
     (
@@ -129,7 +130,88 @@ def get_latents(speaker_wav):
     ) = model.get_conditioning_latents(audio_path=speaker_wav)
     return gpt_cond_latent, diffusion_conditioning, speaker_embedding
 
+def get_latents(speaker_wav):
+    # Generate speaker embedding and latents for TTS
+    (
+        gpt_cond_latent,
+        diffusion_conditioning,
+        speaker_embedding,
+    ) = model.get_conditioning_latents(audio_path=speaker_wav)
+    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
+
+
+latent_map = {}
+latent_map["Female_Voice"] = get_latents("examples/female.wav")
+
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+    # This will create a wave header then append the frame input
+    # It should be first on a streaming wav file
+    # Other frames better should not have it (else you will hear some artifacts each chunk start)
+    wav_buf = io.BytesIO()
+    with wave.open(wav_buf, "wb") as vfout:
+        vfout.setnchannels(channels)
+        vfout.setsampwidth(sample_width)
+        vfout.setframerate(sample_rate)
+        vfout.writeframes(frame_input)
+
+    wav_buf.seek(0)
+    return wav_buf.read()
+
+
+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+    try:
+        t0 = time.time()
+        chunks = model.inference_stream(
+            prompt,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+        )
+
+        first_chunk = True
+        for i, chunk in enumerate(chunks):
+            if first_chunk:
+                first_chunk_time = time.time() - t0
+                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                first_chunk = False
+            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+            # In case output is required to be multiple voice files
+            # out_file = f'{char}_{i}.wav'
+            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+            # audio = AudioSegment.from_file(out_file)
+            # audio.export(out_file, format='wav')
+            # return out_file
+            # directly return chunk as bytes for streaming
+            chunk = chunk.detach().cpu().numpy().squeeze()
+            chunk = (chunk * 32767).astype(np.int16)
+
+            yield chunk.tobytes()
 
+    except RuntimeError as e:
+        if "device-side assert" in str(e):
+            # cannot do anything on cuda device side error, need tor estart
+            print(
+                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
+                flush=True,
+            )
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            print("Cuda device-assert Runtime encountered need restart")
+
+            # HF Space specific.. This error is unrecoverable need to restart space
+            api.restart_space(repo_id=repo_id)
+        else:
+            print("RuntimeError: non device-side assert error:", str(e))
+            # Does not require warning happens on empty chunk and at end
+            ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            return None
+        return None
+    except:
+        return None
+
+###### MISTRAL FUNCTIONS ######
+
 def format_prompt(message, history):
     prompt = (
         "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
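The block moved into this hunk is the heart of the audio streaming: wave_header_chunk() emits a RIFF/WAVE header exactly once, and get_voice_streaming() turns each XTTS float chunk into 16-bit PCM bytes that are appended after that header. Below is a minimal standalone sketch of the same pattern, with synthetic sine-wave chunks standing in for model output; the helper names and the output filename are mine, not part of the commit.

import io
import wave

import numpy as np


def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    # Build a RIFF/WAVE header (plus any initial frames) in memory.
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)  # 2 bytes per sample -> 16-bit PCM
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)
    wav_buf.seek(0)
    return wav_buf.read()


def float_to_pcm16(chunk):
    # Model output is float audio in [-1.0, 1.0]; scale to signed 16-bit samples.
    return (np.clip(chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()


if __name__ == "__main__":
    # Fake "streamed" chunks: two short sine bursts at 24 kHz instead of XTTS output.
    t = np.linspace(0, 0.25, int(24000 * 0.25), endpoint=False)
    chunks = [np.sin(2 * np.pi * 440 * t), np.sin(2 * np.pi * 660 * t)]

    stream = wave_header_chunk()          # header first, exactly once
    for chunk in chunks:
        stream += float_to_pcm16(chunk)   # raw PCM appended after the header

    with open("demo_stream.wav", "wb") as f:
        f.write(stream)

Because no frames are written through the wave module, the header advertises a zero data length; most players tolerate that for a growing stream, which is the behaviour the "header first, other frames should not have it" comments in the diff count on.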
@@ -140,7 +222,6 @@ def format_prompt(message, history):
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
-
 def generate(
     prompt,
     history,
@@ -197,6 +278,8 @@ def generate(
     return output
 
 
+###### WHISPER FUNCTIONS ######
+
 def transcribe(wav_path):
     try:
         # get result from whisper and strip it to delete begin and end space
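The new "###### WHISPER FUNCTIONS ######" header marks the transcription side; the body of transcribe() is not shown in this hunk. As a rough, hypothetical stand-in (the Space may well call a hosted inference endpoint instead, and the model name here is an assumption), a local Whisper call through the transformers pipeline could look like this:

from transformers import pipeline

# Hypothetical local substitute for the app's Whisper call; not the commit's code.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")


def transcribe(wav_path: str) -> str:
    # Get the result from Whisper and strip leading/trailing whitespace.
    return asr(wav_path)["text"].strip()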
@@ -212,13 +295,13 @@ def transcribe(wav_path):
 
 # Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
 
-
+# Will be triggered on text submit (will send to generate_speech)
 def add_text(history, text):
     history = [] if history is None else history
     history = history + [(text, None)]
     return history, gr.update(value="", interactive=False)
 
-
+# Will be triggered on voice submit (will transribe and send to generate_speech)
 def add_file(history, file):
     history = [] if history is None else history
 
@@ -247,90 +330,8 @@ def bot(history, system_prompt=""):
         history[-1][1] = character
         yield history
 
-
-
-    # Generate speaker embedding and latents for TTS
-    (
-        gpt_cond_latent,
-        diffusion_conditioning,
-        speaker_embedding,
-    ) = model.get_conditioning_latents(audio_path=speaker_wav)
-    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
-
-
-latent_map = {}
-latent_map["Female_Voice"] = get_latents("examples/female.wav")
-
-
-
-def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
-    # This will create a wave header then append the frame input
-    # It should be first on a streaming wav file
-    # Other frames better should not have it (else you will hear some artifacts each chunk start)
-    wav_buf = io.BytesIO()
-    with wave.open(wav_buf, "wb") as vfout:
-        vfout.setnchannels(channels)
-        vfout.setsampwidth(sample_width)
-        vfout.setframerate(sample_rate)
-        vfout.writeframes(frame_input)
-
-    wav_buf.seek(0)
-    return wav_buf.read()
-
-
-def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-    try:
-        t0 = time.time()
-        chunks = model.inference_stream(
-            prompt,
-            language,
-            gpt_cond_latent,
-            speaker_embedding,
-        )
-
-        first_chunk = True
-        for i, chunk in enumerate(chunks):
-            if first_chunk:
-                first_chunk_time = time.time() - t0
-                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
-                first_chunk = False
-            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-
-            # In case output is required to be multiple voice files
-            # out_file = f'{char}_{i}.wav'
-            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
-            # audio = AudioSegment.from_file(out_file)
-            # audio.export(out_file, format='wav')
-            # return out_file
-            # directly return chunk as bytes for streaming
-            chunk = chunk.detach().cpu().numpy().squeeze()
-            chunk = (chunk * 32767).astype(np.int16)
-
-            yield chunk.tobytes()
-
-    except RuntimeError as e:
-        if "device-side assert" in str(e):
-            # cannot do anything on cuda device side error, need tor estart
-            print(
-                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
-                flush=True,
-            )
-            gr.Warning("Unhandled Exception encounter, please retry in a minute")
-            print("Cuda device-assert Runtime encountered need restart")
-
-            # HF Space specific.. This error is unrecoverable need to restart space
-            api.restart_space(repo_id=repo_id)
-        else:
-            print("RuntimeError: non device-side assert error:", str(e))
-            # Does not require warning happens on empty chunk and at end
-            ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
-            return None
-        return None
-    except:
-        return None
-
-
+##### MISTRAL STREAMING Sentence splitter ####
+
 def get_sentence(history, system_prompt=""):
     history = [["", None]] if history is None else history
 
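The new "##### MISTRAL STREAMING Sentence splitter ####" header labels the role of get_sentence(): the chat model streams text token by token, and only completed sentences should be handed to TTS, each exactly once. Here is a self-contained sketch of that idea (a simplification of mine, not the commit's code), using the same nltk.sent_tokenize and sentence hashing seen in the diff:

import nltk

nltk.download("punkt", quiet=True)  # sent_tokenize needs the punkt model


def stream_sentences(token_stream):
    """Yield each completed sentence exactly once while text is still streaming."""
    text = ""
    seen = set()
    for token in token_stream:
        text += token
        sentences = nltk.sent_tokenize(text.replace("\n", " ").strip())
        # All but the last sentence are complete; the last may still be growing.
        for sentence in sentences[:-1]:
            key = hash(sentence)
            if key not in seen:
                seen.add(key)
                yield sentence
    # Flush whatever remains once the stream ends.
    if sentences := nltk.sent_tokenize(text.replace("\n", " ").strip()):
        if hash(sentences[-1]) not in seen:
            yield sentences[-1]


if __name__ == "__main__":
    fake_stream = iter(["Hello ", "there. ", "How are ", "you today? ", "Fine."])
    for s in stream_sentences(fake_stream):
        print("SPEAK:", s)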
@@ -368,7 +369,6 @@ def get_sentence(history, system_prompt=""):
                 yield (sentence, history)
 
    # return that final sentence token
-    # TODO need a counter that one may be replica as before
    last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
    sentence_hash = hash(last_sentence)
    if sentence_hash not in sentence_hash_list:
@@ -378,7 +378,8 @@ def get_sentence(history, system_prompt=""):
 
         yield (last_sentence, history)
 
-
+#### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
+
 def generate_speech(history):
     language = "en"
 
@@ -402,9 +403,8 @@ def generate_speech(history):
         print("Sentence for speech:", sentence)
 
         try:
-            #
-
-            if len(sentence) > 250:
+            #TODO this will be better handled in future using textwrap
+            if len(sentence) > 300:
                 gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
                 # should not generate voice it will hit token limit
                 # It should not generate audio for it
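The added TODO suggests textwrap as a better fix than refusing to speak sentences longer than 300 characters. One way that could look (an illustration only; the commit itself just warns and skips the sentence):

import textwrap

MAX_TTS_CHARS = 300  # mirrors the new length guard in generate_speech


def split_for_tts(sentence: str, limit: int = MAX_TTS_CHARS) -> list[str]:
    # Break an over-long sentence on word boundaries into chunks the TTS model
    # can handle, instead of dropping it entirely. Not the commit's behaviour.
    if len(sentence) <= limit:
        return [sentence]
    return textwrap.wrap(sentence, width=limit)


if __name__ == "__main__":
    long_sentence = ("word " * 120).strip()  # roughly 600 characters
    for piece in split_for_tts(long_sentence):
        print(len(piece), piece[:40] + "...")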
@@ -413,6 +413,8 @@
             audio_stream = get_voice_streaming(
                 sentence, language, latent_map["Female_Voice"]
             )
+            # XTTS is actually using streaming response but we are playing audio by sentence
+            # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
             if audio_stream is not None:
                 wav_chunks = wave_header_chunk()
                 frame_length = 0
@@ -485,7 +487,8 @@ def generate_speech(history):
     yield (gr.Audio.update(value=None, autoplay=False), history)
     yield (gr.Audio.update(value=outfile, autoplay=False), history)
 
-
+#### GRADIO INTERFACE ####
+
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
 
@@ -547,7 +550,9 @@ It relies on 3 models:
 3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
-- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
+- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
+- Responses generated by chat model should not be assumed correct as this is a demonstration example only
+- iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
 )
 demo.queue()
 demo.launch(debug=True)