Commit 0a0b1ab
1 Parent(s): f31f07e
improvements
app.py  CHANGED

@@ -11,8 +11,9 @@ import gradio as gr
 import numpy as np
 import torch
 import nltk  # we'll use this to split into sentences
-
 nltk.download("punkt")
+
+import langid
 import uuid
 
 import datetime

@@ -33,9 +34,10 @@ from TTS.utils.generic_utils import get_user_data_dir
 # For older cards (like 2070 or T4) will reduce value to to smaller for unnecessary waiting
 # Could not make play audio next work seemlesly on current Gradio with autoplay so this is a workaround
 AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 0.9))
-
+print("AUDIO_WAIT_MODIFIER set to",AUDIO_WAIT_MODIFIER)
 # if set will try to stream audio while receveng audio chunks, beware that recreating audio each time produces artifacts
 DIRECT_STREAM = int(os.environ.get("DIRECT_STREAM", 0))
+print("DIRECT_STREAM set to",DIRECT_STREAM)
 
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V1")

@@ -73,7 +75,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "
+repo_id = "coqui/voice-chat-with-mistral"
 
 default_system_message = """
 You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.

@@ -94,6 +96,7 @@ system_understand_message = os.environ.get(
     "SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
 )
 
+print("Mistral system message set as:", default_system_message)
 
 temperature = 0.9
 top_p = 0.6

@@ -157,9 +160,28 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     wav_buf.seek(0)
     return wav_buf.read()
 
-
+xtts_supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
 def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
     gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+
+    # Fast language autodetection
+    if len(prompt)>15 and language=="autodetect":
+        language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
+        if language_predicted == "zh":
+            #we use zh-cn on xtts
+            language_predicted = "zh-cn"
+        if language_predicted not in xtts_supported_languages:
+            print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
+            gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet' ")
+            language= "en"
+        else:
+            language = language_predicted
+        print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")
+    else:
+        # Hard to detect language fast in short sentence, use english default
+        language = "en"
+        print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
+
     try:
         t0 = time.time()
         chunks = model.inference_stream(

@@ -381,7 +403,7 @@ def get_sentence(history, system_prompt=""):
 #### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
 
 def generate_speech(history):
-    language = "
+    language = "autodetect"
 
     wav_bytestream = b""
     for sentence, history in get_sentence(history):

@@ -403,65 +425,75 @@ def generate_speech(history):
         print("Sentence for speech:", sentence)
 
         try:
-
-
-
-                # should not generate voice it will hit token limit
-                # It should not generate audio for it
-                audio_stream = None
+            if len(sentence)<300:
+                # no problem continue on
+                sentence_list = [sentence]
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-                                yield (
-                                    gr.Audio.update(
-                                        value=wave_header_chunk() + chunk, autoplay=True
-                                    ),
-                                    history,
-                                )
-                                wait_time = len(chunk) / 2 / 24000
-                                wait_time = AUDIO_WAIT_MODIFIER * wait_time
-                                print("Sleeping till chunk end")
-                                time.sleep(wait_time)
-
-                            else:
-                                wav_chunks += chunk
-                                frame_length += len(chunk)
-                        except:
-                            # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
-                            continue
-
-                    if not DIRECT_STREAM:
-                        yield (
-                            gr.Audio.update(value=None, autoplay=True),
-                            history,
-                        ) # hack to switch autoplay
-                        if audio_stream is not None:
-                            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
-                        # Streaming wait time calculation
-                        # audio_length = frame_length / sample_width/ frame_rate
-                        wait_time = frame_length / 2 / 24000
-
-                        # for non streaming
-                        # wait_time= librosa.get_duration(path=wav)
-
-                        wait_time = AUDIO_WAIT_MODIFIER * wait_time
-                        print("Sleeping till audio end")
-                        time.sleep(wait_time)
+                # Until now nltk likely split sentences properly but we need additional
+                # check for longer sentence and split at last possible position
+                # Do whatever necessary, first break at hypens then spaces and then even split very long words
+                sentence_list=textwrap(sentence,300)
+                print("SPLITTED LONG SENTENCE:",sentence_list)
+
+            for sentence in sentence_list:
+                if any(c.isalnum() for c in sentence):
+                    #exists at least 1 alphanumeric (utf-8)
+                    audio_stream = get_voice_streaming(
+                        sentence, language, latent_map["Female_Voice"]
+                    )
                 else:
-                    #
-
-
-
+                    # likely got a ' or " or some other text without alphanumeric in it
+                    audio_stream = None
+
+                # XTTS is actually using streaming response but we are playing audio by sentence
+                # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
+                if audio_stream is not None:
+                    wav_chunks = wave_header_chunk()
+                    frame_length = 0
+                    for chunk in audio_stream:
+                        try:
+                            wav_bytestream += chunk
+                            if DIRECT_STREAM:
+                                yield (
+                                    gr.Audio.update(
+                                        value=wave_header_chunk() + chunk, autoplay=True
+                                    ),
+                                    history,
+                                )
+                                wait_time = len(chunk) / 2 / 24000
+                                wait_time = AUDIO_WAIT_MODIFIER * wait_time
+                                print("Sleeping till chunk end")
+                                time.sleep(wait_time)
+
+                            else:
+                                wav_chunks += chunk
+                                frame_length += len(chunk)
+                        except:
+                            # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
+                            continue
+
+                    if not DIRECT_STREAM:
+                        yield (
+                            gr.Audio.update(value=None, autoplay=True),
+                            history,
+                        ) # hack to switch autoplay
+                        if audio_stream is not None:
+                            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
+                        # Streaming wait time calculation
+                        # audio_length = frame_length / sample_width/ frame_rate
+                        wait_time = frame_length / 2 / 24000
+
+                        # for non streaming
+                        # wait_time= librosa.get_duration(path=wav)
+
+                        wait_time = AUDIO_WAIT_MODIFIER * wait_time
+                        print("Sleeping till audio end")
+                        time.sleep(wait_time)
+                else:
+                    # Either too much text or some programming, give a silence so stream continues
+                    second_of_silence = AudioSegment.silent() # use default
+                    second_of_silence.export("sil.wav", format="wav")
+                    yield (gr.Audio.update(value="sil.wav", autoplay=True), history)
 
         except RuntimeError as e:
             if "device-side assert" in str(e):

@@ -479,7 +511,7 @@ def generate_speech(history):
                 print("RuntimeError: non device-side assert error:", str(e))
                 raise e
 
-    time.sleep(1.
+    time.sleep(1.5)
    wav_bytestream = wave_header_chunk() + wav_bytestream
    outfile = "combined.wav"
    with open(outfile, "wb") as f:

@@ -495,7 +527,7 @@ with gr.Blocks(title=title) as demo:
     chatbot = gr.Chatbot(
         [],
         elem_id="chatbot",
-        avatar_images=("examples/
+        avatar_images=("examples/mirror.png", "examples/coqui-logo.png"),
         bubble_full_width=False,
     )
 
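For readers who want to try the two sentence-handling behaviours this commit introduces outside the app, here is a minimal standalone sketch: langid-based language autodetection with an English fallback, and length-based splitting of overlong sentences. The helper names and example prompts below are illustrative, not part of app.py, and the sketch assumes the langid package is installed. Note that the diff calls textwrap(sentence, 300); textwrap is a module, and the standard-library call that wraps text into chunks of at most 300 characters is textwrap.wrap.

# Illustrative sketch only, not the app's code.
import textwrap

import langid

# Language codes XTTS can speak, as listed in the diff.
XTTS_SUPPORTED_LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"]


def detect_language(prompt, language="autodetect"):
    """Mirror the commit's logic: autodetect only for reasonably long prompts."""
    if language != "autodetect":
        return language
    if len(prompt) <= 15:
        # Short prompts classify unreliably; the app falls back to English.
        return "en"
    predicted = langid.classify(prompt)[0].strip()
    if predicted == "zh":
        predicted = "zh-cn"  # XTTS expects the zh-cn code
    return predicted if predicted in XTTS_SUPPORTED_LANGUAGES else "en"


def split_long_sentence(sentence, limit=300):
    """Split sentences longer than `limit` characters, as the commit intends."""
    # The diff writes textwrap(sentence, 300); the callable helper is textwrap.wrap.
    return [sentence] if len(sentence) < limit else textwrap.wrap(sentence, limit)


print(detect_language("Bonjour, comment allez-vous aujourd'hui ?"))  # usually "fr"
print(split_long_sentence("word " * 100))  # list of chunks, each at most 300 characters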