Commit 0a0b1ab
1 Parent(s): f31f07e
improvements
app.py  CHANGED

@@ -11,8 +11,9 @@ import gradio as gr
 import numpy as np
 import torch
 import nltk  # we'll use this to split into sentences
-
 nltk.download("punkt")
+
+import langid
 import uuid
 
 import datetime

@@ -33,9 +34,10 @@ from TTS.utils.generic_utils import get_user_data_dir
 # For older cards (like 2070 or T4) will reduce value to to smaller for unnecessary waiting
 # Could not make play audio next work seemlesly on current Gradio with autoplay so this is a workaround
 AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 0.9))
-
+print("AUDIO_WAIT_MODIFIER set to",AUDIO_WAIT_MODIFIER)
 # if set will try to stream audio while receveng audio chunks, beware that recreating audio each time produces artifacts
 DIRECT_STREAM = int(os.environ.get("DIRECT_STREAM", 0))
+print("DIRECT_STREAM set to",DIRECT_STREAM)
 
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V1")

@@ -73,7 +75,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "
+repo_id = "coqui/voice-chat-with-mistral"
 
 default_system_message = """
 You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.

@@ -94,6 +96,7 @@ system_understand_message = os.environ.get(
     "SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
 )
 
+print("Mistral system message set as:", default_system_message)
 
 temperature = 0.9
 top_p = 0.6

@@ -157,9 +160,28 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     wav_buf.seek(0)
     return wav_buf.read()
 
-
+xtts_supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
 def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
     gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+
+    # Fast language autodetection
+    if len(prompt)>15 and language=="autodetect":
+        language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
+        if language_predicted == "zh":
+            #we use zh-cn on xtts
+            language_predicted = "zh-cn"
+        if language_predicted not in xtts_supported_languages:
+            print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
+            gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet' ")
+            language= "en"
+        else:
+            language = language_predicted
+        print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")
+    else:
+        # Hard to detect language fast in short sentence, use english default
+        language = "en"
+        print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
+
     try:
         t0 = time.time()
         chunks = model.inference_stream(

@@ -381,7 +403,7 @@ def get_sentence(history, system_prompt=""):
 #### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
 
 def generate_speech(history):
-    language = "
+    language = "autodetect"
 
     wav_bytestream = b""
     for sentence, history in get_sentence(history):

@@ -403,65 +425,75 @@ def generate_speech(history):
         print("Sentence for speech:", sentence)
 
         try:
-
-
-
-                # should not generate voice it will hit token limit
-                # It should not generate audio for it
-                audio_stream = None
+            if len(sentence)<300:
+                # no problem continue on
+                sentence_list = [sentence]
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-                                yield (
-                                    gr.Audio.update(
-                                        value=wave_header_chunk() + chunk, autoplay=True
-                                    ),
-                                    history,
-                                )
-                                wait_time = len(chunk) / 2 / 24000
-                                wait_time = AUDIO_WAIT_MODIFIER * wait_time
-                                print("Sleeping till chunk end")
-                                time.sleep(wait_time)
-
-                            else:
-                                wav_chunks += chunk
-                                frame_length += len(chunk)
-                        except:
-                            # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
-                            continue
-
-                    if not DIRECT_STREAM:
-                        yield (
-                            gr.Audio.update(value=None, autoplay=True),
-                            history,
-                        ) # hack to switch autoplay
-                        if audio_stream is not None:
-                            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
-                        # Streaming wait time calculation
-                        # audio_length = frame_length / sample_width/ frame_rate
-                        wait_time = frame_length / 2 / 24000
-
-                        # for non streaming
-                        # wait_time= librosa.get_duration(path=wav)
-
-                        wait_time = AUDIO_WAIT_MODIFIER * wait_time
-                        print("Sleeping till audio end")
-                        time.sleep(wait_time)
+                # Until now nltk likely split sentences properly but we need additional
+                # check for longer sentence and split at last possible position
+                # Do whatever necessary, first break at hypens then spaces and then even split very long words
+                sentence_list=textwrap(sentence,300)
+                print("SPLITTED LONG SENTENCE:",sentence_list)
+
+            for sentence in sentence_list:
+                if any(c.isalnum() for c in sentence):
+                    #exists at least 1 alphanumeric (utf-8)
+                    audio_stream = get_voice_streaming(
+                        sentence, language, latent_map["Female_Voice"]
+                    )
                 else:
-                    #
-
-
-
+                    # likely got a ' or " or some other text without alphanumeric in it
+                    audio_stream = None
+
+                # XTTS is actually using streaming response but we are playing audio by sentence
+                # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
+                if audio_stream is not None:
+                    wav_chunks = wave_header_chunk()
+                    frame_length = 0
+                    for chunk in audio_stream:
+                        try:
+                            wav_bytestream += chunk
+                            if DIRECT_STREAM:
+                                yield (
+                                    gr.Audio.update(
+                                        value=wave_header_chunk() + chunk, autoplay=True
+                                    ),
+                                    history,
+                                )
+                                wait_time = len(chunk) / 2 / 24000
+                                wait_time = AUDIO_WAIT_MODIFIER * wait_time
+                                print("Sleeping till chunk end")
+                                time.sleep(wait_time)
+
+                            else:
+                                wav_chunks += chunk
+                                frame_length += len(chunk)
+                        except:
+                            # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
+                            continue
+
+                    if not DIRECT_STREAM:
+                        yield (
+                            gr.Audio.update(value=None, autoplay=True),
+                            history,
+                        ) # hack to switch autoplay
+                        if audio_stream is not None:
+                            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
+                        # Streaming wait time calculation
+                        # audio_length = frame_length / sample_width/ frame_rate
+                        wait_time = frame_length / 2 / 24000
+
+                        # for non streaming
+                        # wait_time= librosa.get_duration(path=wav)
+
+                        wait_time = AUDIO_WAIT_MODIFIER * wait_time
+                        print("Sleeping till audio end")
+                        time.sleep(wait_time)
+                else:
+                    # Either too much text or some programming, give a silence so stream continues
+                    second_of_silence = AudioSegment.silent() # use default
+                    second_of_silence.export("sil.wav", format="wav")
+                    yield (gr.Audio.update(value="sil.wav", autoplay=True), history)
 
         except RuntimeError as e:
             if "device-side assert" in str(e):

@@ -479,7 +511,7 @@ def generate_speech(history):
                 print("RuntimeError: non device-side assert error:", str(e))
                 raise e
 
-    time.sleep(1.
+    time.sleep(1.5)
    wav_bytestream = wave_header_chunk() + wav_bytestream
    outfile = "combined.wav"
    with open(outfile, "wb") as f:

@@ -495,7 +527,7 @@ with gr.Blocks(title=title) as demo:
     chatbot = gr.Chatbot(
         [],
         elem_id="chatbot",
-        avatar_images=("examples/
+        avatar_images=("examples/mirror.png", "examples/coqui-logo.png"),
         bubble_full_width=False,
     )
 
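For readers who want to try the two sentence-handling behaviours this commit introduces outside the app, here is a minimal standalone sketch: langid-based language autodetection with an English fallback, and length-based splitting of overlong sentences. The helper names and example prompts below are illustrative, not part of app.py, and the sketch assumes the langid package is installed. Note that the diff calls textwrap(sentence, 300); textwrap is a module, and the standard-library call that wraps text into chunks of at most 300 characters is textwrap.wrap.

# Illustrative sketch only, not the app's code.
import textwrap

import langid

# Language codes XTTS can speak, as listed in the diff.
XTTS_SUPPORTED_LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"]


def detect_language(prompt, language="autodetect"):
    """Mirror the commit's logic: autodetect only for reasonably long prompts."""
    if language != "autodetect":
        return language
    if len(prompt) <= 15:
        # Short prompts classify unreliably; the app falls back to English.
        return "en"
    predicted = langid.classify(prompt)[0].strip()
    if predicted == "zh":
        predicted = "zh-cn"  # XTTS expects the zh-cn code
    return predicted if predicted in XTTS_SUPPORTED_LANGUAGES else "en"


def split_long_sentence(sentence, limit=300):
    """Split sentences longer than `limit` characters, as the commit intends."""
    # The diff writes textwrap(sentence, 300); the callable helper is textwrap.wrap.
    return [sentence] if len(sentence) < limit else textwrap.wrap(sentence, limit)


print(detect_language("Bonjour, comment allez-vous aujourd'hui ?"))  # usually "fr"
print(split_long_sentence("word " * 100))  # list of chunks, each at most 300 characters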