Satyam-Singh committed (verified)
Commit a0e2f2a · 1 Parent(s): 3f52819

Create app.py

Files changed (1):
  app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
import os
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
import speech_recognition as sr
import google.generativeai as genai
from dotenv import load_dotenv
import streamlit as st

# Path to store voice files
path = "../data/voice/"
os.makedirs(path, exist_ok=True)
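
# Environment note: pydub needs ffmpeg on PATH to decode and play MP3 audio,
# and sr.Microphone requires PyAudio; both are assumed to be installed.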

# 1. Save and play voice created by Google Text-to-Speech (gTTS)
def text_to_audio(text, filename):
    tts = gTTS(text)
    file_path = os.path.join(path, filename)
    tts.save(file_path)
    return file_path


def play_audio(file_path):
    audio = AudioSegment.from_file(file_path)
    play(audio)
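
# Example usage (hypothetical file name):
#   play_audio(text_to_audio("Hello there!", "greeting.mp3"))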

# 2. Use microphone to record voice
def record_audio(duration=4):
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Adjusting noise...")
        recognizer.adjust_for_ambient_noise(source, duration=1)
        print(f"Recording for {duration} seconds...")
        # `timeout` only bounds the wait for speech to start; `phrase_time_limit`
        # caps the recording length so it actually stops after `duration` seconds.
        recorded_audio = recognizer.listen(source, timeout=duration, phrase_time_limit=duration)
        print("Done recording.")
        return recorded_audio
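
# Example: capture up to ~5 seconds from the default microphone.
#   audio = record_audio(duration=5)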

# 3. Convert the recorded voice to text through speech-to-text (STT)
def audio_to_text(audio):
    recognizer = sr.Recognizer()
    try:
        print("Recognizing the text...")
        text = recognizer.recognize_google(audio, language="en-US")
        print("Decoded Text: {}".format(text))
    except sr.UnknownValueError:
        text = "Google Speech Recognition could not understand the audio."
    except sr.RequestError:
        text = "Could not request results from Google Speech Recognition service."
    return text
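
# Note: recognize_google uses Google's free web Speech API, which needs an
# internet connection but no API key; failures come back as plain strings,
# which llm_voice_response below branches on.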

# 4. Convert the text to voice through text-to-speech (TTS)
def text_to_speech(text):
    tts = gTTS(text)
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    audio_segment = AudioSegment.from_file(audio_buffer, format="mp3")
    play(audio_segment)
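
# Note: play() outputs audio on the machine running this script, so under
# Streamlit the response is audible on the server, not in the browser;
# st.audio(audio_buffer) would be a browser-side alternative.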

# 5. Make a voice-to-voice stream
def voice_to_voice():
    recorded_audio = record_audio()
    text = audio_to_text(recorded_audio)
    text_to_speech(text)
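
# voice_to_voice() simply echoes the transcribed speech back as synthesized
# audio; it is the building block the LLM-backed flow below extends.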

# 6. Integrate an LLM to respond to voice input with voice output
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
gemini_pro = genai.GenerativeModel(model_name="models/gemini-pro")
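
# Assumes a .env file (or environment variable) providing:
#   GOOGLE_API_KEY=<your Google AI Studio key>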

def respond_by_gemini(input_text, role_text, instructions_text):
    final_prompt = [
        "ROLE: " + role_text,
        "INPUT_TEXT: " + input_text,
        instructions_text,
    ]
    response = gemini_pro.generate_content(
        final_prompt,
        stream=True,
    )
    response_list = []
    for chunk in response:
        response_list.append(chunk.text)
    response_text = "".join(response_list)
    return response_text
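
# Example call (hypothetical role and instructions):
#   respond_by_gemini("Will AI replace programmers?",
#                     "You are an intelligent assistant.",
#                     "Respond briefly in chat style.")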

def llm_voice_response():
    role = 'You are an intelligent assistant to chat on the topic: `{}`.'
    topic = 'The future of artificial intelligence'
    role_text = role.format(topic)
    instructions = 'Respond to the INPUT_TEXT briefly in chat style. Respond based on your knowledge about `{}` in brief chat style.'
    instructions_text = instructions.format(topic)

    recorded_audio = record_audio()
    text = audio_to_text(recorded_audio)
    response_text = text
    # Only query the LLM when transcription succeeded; otherwise speak the error.
    if text not in [
        "Google Speech Recognition could not understand the audio.",
        "Could not request results from Google Speech Recognition service.",
    ]:
        response_text = respond_by_gemini(text, role_text, instructions_text)
    text_to_speech(response_text)

# 7. Build a Web interface for the LLM-supported voice assistant
def main():
    # Streamlit setup with custom CSS
    st.set_page_config(page_title="LLM-Supported Voice Assistant", layout="wide")

    st.markdown("""
        <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet">
        <style>
        .main {background-color: #f5f5f5;}
        .container {max-width: 800px; margin: auto; padding-top: 50px;}
        .title {font-family: 'Arial', sans-serif; color: #333333; margin-bottom: 30px;}
        .btn {background-color: #4CAF50; color: white; border: none; padding: 10px 20px; cursor: pointer; font-size: 16px;}
        .btn:hover {background-color: #45a049;}
        </style>
        """, unsafe_allow_html=True)

    st.markdown("<div class='container'><h1 class='title'>LLM-Supported Voice Assistant</h1></div>", unsafe_allow_html=True)

    st.write("This is a voice assistant with LLM support. Speak into the microphone, and the assistant will respond.")

    if st.button("Record and Get Response", key="record_btn"):
        st.write("Listening...")
        llm_voice_response()
        st.write("Done.")

    st.markdown("<div class='container'><h5>Press the button and speak into the microphone. The assistant will generate a response based on the input and speak it out loud.</h5></div>", unsafe_allow_html=True)

if __name__ == "__main__":
    main()
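
# To try it locally (assuming the packages imported above are installed):
#   streamlit run app.py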