Commit 6af5041
Parent(s): 168536f
add some comments and remove unnecessary comemnts
app.py
CHANGED
@@ -120,6 +120,7 @@ text_client = InferenceClient(
 
 
 ###### COQUI TTS FUNCTIONS ######
+
 def get_latents(speaker_wav):
     # create as function as we can populate here with voice cleanup/filtering
     (
@@ -129,7 +130,88 @@ def get_latents(speaker_wav):
     ) = model.get_conditioning_latents(audio_path=speaker_wav)
     return gpt_cond_latent, diffusion_conditioning, speaker_embedding
 
+def get_latents(speaker_wav):
+    # Generate speaker embedding and latents for TTS
+    (
+        gpt_cond_latent,
+        diffusion_conditioning,
+        speaker_embedding,
+    ) = model.get_conditioning_latents(audio_path=speaker_wav)
+    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
+
+
+latent_map = {}
+latent_map["Female_Voice"] = get_latents("examples/female.wav")
+
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+    # This will create a wave header then append the frame input
+    # It should be first on a streaming wav file
+    # Other frames better should not have it (else you will hear some artifacts each chunk start)
+    wav_buf = io.BytesIO()
+    with wave.open(wav_buf, "wb") as vfout:
+        vfout.setnchannels(channels)
+        vfout.setsampwidth(sample_width)
+        vfout.setframerate(sample_rate)
+        vfout.writeframes(frame_input)
+
+    wav_buf.seek(0)
+    return wav_buf.read()
+
+
+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+    try:
+        t0 = time.time()
+        chunks = model.inference_stream(
+            prompt,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+        )
+
+        first_chunk = True
+        for i, chunk in enumerate(chunks):
+            if first_chunk:
+                first_chunk_time = time.time() - t0
+                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                first_chunk = False
+            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+            # In case output is required to be multiple voice files
+            # out_file = f'{char}_{i}.wav'
+            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+            # audio = AudioSegment.from_file(out_file)
+            # audio.export(out_file, format='wav')
+            # return out_file
+            # directly return chunk as bytes for streaming
+            chunk = chunk.detach().cpu().numpy().squeeze()
+            chunk = (chunk * 32767).astype(np.int16)
+
+            yield chunk.tobytes()
 
+    except RuntimeError as e:
+        if "device-side assert" in str(e):
+            # cannot do anything on cuda device side error, need tor estart
+            print(
+                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
+                flush=True,
+            )
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            print("Cuda device-assert Runtime encountered need restart")
+
+            # HF Space specific.. This error is unrecoverable need to restart space
+            api.restart_space(repo_id=repo_id)
+        else:
+            print("RuntimeError: non device-side assert error:", str(e))
+            # Does not require warning happens on empty chunk and at end
+            ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            return None
+        return None
+    except:
+        return None
+
+###### MISTRAL FUNCTIONS ######
+
 def format_prompt(message, history):
     prompt = (
         "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
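The block moved into this hunk is the heart of the audio streaming: wave_header_chunk() emits a RIFF/WAVE header exactly once, and get_voice_streaming() turns each XTTS float chunk into 16-bit PCM bytes that are appended after that header. Below is a minimal standalone sketch of the same pattern, with synthetic sine-wave chunks standing in for model output; the helper names and the output filename are mine, not part of the commit.

import io
import wave

import numpy as np


def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    # Build a RIFF/WAVE header (plus any initial frames) in memory.
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)  # 2 bytes per sample -> 16-bit PCM
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)
    wav_buf.seek(0)
    return wav_buf.read()


def float_to_pcm16(chunk):
    # Model output is float audio in [-1.0, 1.0]; scale to signed 16-bit samples.
    return (np.clip(chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()


if __name__ == "__main__":
    # Fake "streamed" chunks: two short sine bursts at 24 kHz instead of XTTS output.
    t = np.linspace(0, 0.25, int(24000 * 0.25), endpoint=False)
    chunks = [np.sin(2 * np.pi * 440 * t), np.sin(2 * np.pi * 660 * t)]

    stream = wave_header_chunk()          # header first, exactly once
    for chunk in chunks:
        stream += float_to_pcm16(chunk)   # raw PCM appended after the header

    with open("demo_stream.wav", "wb") as f:
        f.write(stream)

Because no frames are written through the wave module, the header advertises a zero data length; most players tolerate that for a growing stream, which is the behaviour the "header first, other frames should not have it" comments in the diff count on.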
@@ -140,7 +222,6 @@ def format_prompt(message, history):
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
-
 def generate(
     prompt,
     history,
@@ -197,6 +278,8 @@ def generate(
     return output
 
 
+###### WHISPER FUNCTIONS ######
+
 def transcribe(wav_path):
     try:
         # get result from whisper and strip it to delete begin and end space
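The new "###### WHISPER FUNCTIONS ######" header marks the transcription side; the body of transcribe() is not shown in this hunk. As a rough, hypothetical stand-in (the Space may well call a hosted inference endpoint instead, and the model name here is an assumption), a local Whisper call through the transformers pipeline could look like this:

from transformers import pipeline

# Hypothetical local substitute for the app's Whisper call; not the commit's code.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")


def transcribe(wav_path: str) -> str:
    # Get the result from Whisper and strip leading/trailing whitespace.
    return asr(wav_path)["text"].strip()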
@@ -212,13 +295,13 @@ def transcribe(wav_path):
 
 # Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
 
-
+# Will be triggered on text submit (will send to generate_speech)
 def add_text(history, text):
     history = [] if history is None else history
     history = history + [(text, None)]
     return history, gr.update(value="", interactive=False)
 
-
+# Will be triggered on voice submit (will transribe and send to generate_speech)
 def add_file(history, file):
     history = [] if history is None else history
 
@@ -247,90 +330,8 @@ def bot(history, system_prompt=""):
         history[-1][1] = character
         yield history
 
-
-
-    # Generate speaker embedding and latents for TTS
-    (
-        gpt_cond_latent,
-        diffusion_conditioning,
-        speaker_embedding,
-    ) = model.get_conditioning_latents(audio_path=speaker_wav)
-    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
-
-
-latent_map = {}
-latent_map["Female_Voice"] = get_latents("examples/female.wav")
-
-
-
-def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
-    # This will create a wave header then append the frame input
-    # It should be first on a streaming wav file
-    # Other frames better should not have it (else you will hear some artifacts each chunk start)
-    wav_buf = io.BytesIO()
-    with wave.open(wav_buf, "wb") as vfout:
-        vfout.setnchannels(channels)
-        vfout.setsampwidth(sample_width)
-        vfout.setframerate(sample_rate)
-        vfout.writeframes(frame_input)
-
-    wav_buf.seek(0)
-    return wav_buf.read()
-
-
-def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-    try:
-        t0 = time.time()
-        chunks = model.inference_stream(
-            prompt,
-            language,
-            gpt_cond_latent,
-            speaker_embedding,
-        )
-
-        first_chunk = True
-        for i, chunk in enumerate(chunks):
-            if first_chunk:
-                first_chunk_time = time.time() - t0
-                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
-                first_chunk = False
-            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-
-            # In case output is required to be multiple voice files
-            # out_file = f'{char}_{i}.wav'
-            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
-            # audio = AudioSegment.from_file(out_file)
-            # audio.export(out_file, format='wav')
-            # return out_file
-            # directly return chunk as bytes for streaming
-            chunk = chunk.detach().cpu().numpy().squeeze()
-            chunk = (chunk * 32767).astype(np.int16)
-
-            yield chunk.tobytes()
-
-    except RuntimeError as e:
-        if "device-side assert" in str(e):
-            # cannot do anything on cuda device side error, need tor estart
-            print(
-                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
-                flush=True,
-            )
-            gr.Warning("Unhandled Exception encounter, please retry in a minute")
-            print("Cuda device-assert Runtime encountered need restart")
-
-            # HF Space specific.. This error is unrecoverable need to restart space
-            api.restart_space(repo_id=repo_id)
-        else:
-            print("RuntimeError: non device-side assert error:", str(e))
-            # Does not require warning happens on empty chunk and at end
-            ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
-            return None
-        return None
-    except:
-        return None
-
-
+##### MISTRAL STREAMING Sentence splitter ####
+
 def get_sentence(history, system_prompt=""):
     history = [["", None]] if history is None else history
 
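The new "##### MISTRAL STREAMING Sentence splitter ####" header labels the role of get_sentence(): the chat model streams text token by token, and only completed sentences should be handed to TTS, each exactly once. Here is a self-contained sketch of that idea (a simplification of mine, not the commit's code), using the same nltk.sent_tokenize and sentence hashing seen in the diff:

import nltk

nltk.download("punkt", quiet=True)  # sent_tokenize needs the punkt model


def stream_sentences(token_stream):
    """Yield each completed sentence exactly once while text is still streaming."""
    text = ""
    seen = set()
    for token in token_stream:
        text += token
        sentences = nltk.sent_tokenize(text.replace("\n", " ").strip())
        # All but the last sentence are complete; the last may still be growing.
        for sentence in sentences[:-1]:
            key = hash(sentence)
            if key not in seen:
                seen.add(key)
                yield sentence
    # Flush whatever remains once the stream ends.
    if sentences := nltk.sent_tokenize(text.replace("\n", " ").strip()):
        if hash(sentences[-1]) not in seen:
            yield sentences[-1]


if __name__ == "__main__":
    fake_stream = iter(["Hello ", "there. ", "How are ", "you today? ", "Fine."])
    for s in stream_sentences(fake_stream):
        print("SPEAK:", s)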
@@ -368,7 +369,6 @@ def get_sentence(history, system_prompt=""):
                 yield (sentence, history)
 
    # return that final sentence token
-    # TODO need a counter that one may be replica as before
    last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
    sentence_hash = hash(last_sentence)
    if sentence_hash not in sentence_hash_list:
@@ -378,7 +378,8 @@ def get_sentence(history, system_prompt=""):
 
         yield (last_sentence, history)
 
-
+#### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
+
 def generate_speech(history):
     language = "en"
 
@@ -402,9 +403,8 @@ def generate_speech(history):
         print("Sentence for speech:", sentence)
 
         try:
-            #
-
-            if len(sentence) > 250:
+            #TODO this will be better handled in future using textwrap
+            if len(sentence) > 300:
                 gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
                 # should not generate voice it will hit token limit
                 # It should not generate audio for it
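The added TODO suggests textwrap as a better fix than refusing to speak sentences longer than 300 characters. One way that could look (an illustration only; the commit itself just warns and skips the sentence):

import textwrap

MAX_TTS_CHARS = 300  # mirrors the new length guard in generate_speech


def split_for_tts(sentence: str, limit: int = MAX_TTS_CHARS) -> list[str]:
    # Break an over-long sentence on word boundaries into chunks the TTS model
    # can handle, instead of dropping it entirely. Not the commit's behaviour.
    if len(sentence) <= limit:
        return [sentence]
    return textwrap.wrap(sentence, width=limit)


if __name__ == "__main__":
    long_sentence = ("word " * 120).strip()  # roughly 600 characters
    for piece in split_for_tts(long_sentence):
        print(len(piece), piece[:40] + "...")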
@@ -413,6 +413,8 @@
             audio_stream = get_voice_streaming(
                 sentence, language, latent_map["Female_Voice"]
             )
+            # XTTS is actually using streaming response but we are playing audio by sentence
+            # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
             if audio_stream is not None:
                 wav_chunks = wave_header_chunk()
                 frame_length = 0
@@ -485,7 +487,8 @@ def generate_speech(history):
     yield (gr.Audio.update(value=None, autoplay=False), history)
     yield (gr.Audio.update(value=outfile, autoplay=False), history)
 
-
+#### GRADIO INTERFACE ####
+
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
 
@@ -547,7 +550,9 @@ It relies on 3 models:
 3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
-- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
+- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
+- Responses generated by chat model should not be assumed correct as this is a demonstration example only
+- iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
 )
 demo.queue()
 demo.launch(debug=True)