Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from gtts import gTTS
|
3 |
+
from io import BytesIO
|
4 |
+
from pydub import AudioSegment
|
5 |
+
from pydub.playback import play
|
6 |
+
import speech_recognition as sr
|
7 |
+
from IPython.display import Audio
|
8 |
+
import google.generativeai as genai
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
import streamlit as st
|
11 |
+
|
12 |
+
# Path to store voice files
# NOTE(review): this is relative to the process working directory, not to this
# file — launching the app from a different directory will create
# ../data/voice somewhere else. Confirm the intended layout.
path = "../data/voice/"
os.makedirs(path, exist_ok=True)
|
15 |
+
|
16 |
+
# 1. Save and play voice created by Google Text-to-Speech (gTTS)
def text_to_audio(text, filename):
    """Synthesize `text` with gTTS and save it under the voice directory.

    Parameters
    ----------
    text : str
        The text to synthesize.
    filename : str
        File name (e.g. "reply.mp3") placed inside the module-level `path`.

    Returns
    -------
    str
        Full path of the saved audio file.
    """
    destination = os.path.join(path, filename)
    gTTS(text).save(destination)
    return destination
|
22 |
+
|
23 |
+
def play_audio(file_path):
    """Load the audio file at `file_path` with pydub and play it aloud."""
    play(AudioSegment.from_file(file_path))
|
26 |
+
|
27 |
+
# 2. Use microphone to record voice
def record_audio(duration=4):
    """Record a phrase from the default microphone.

    Parameters
    ----------
    duration : int
        Maximum number of seconds to wait for speech to start AND the
        maximum length of the recorded phrase.

    Returns
    -------
    speech_recognition.AudioData
        The captured audio, ready for `audio_to_text`.

    Raises
    ------
    sr.WaitTimeoutError
        If no speech starts within `duration` seconds.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Adjusting noise...")
        # One second of ambient sampling to calibrate the energy threshold.
        recognizer.adjust_for_ambient_noise(source, duration=1)
        print(f"Recording for {duration} seconds...")
        # BUG FIX: `timeout` only bounds the wait for speech to *begin*;
        # the original passed timeout=duration alone, so once speech started
        # the recording could run arbitrarily long. `phrase_time_limit`
        # is the parameter that actually caps the recording length.
        recorded_audio = recognizer.listen(
            source, timeout=duration, phrase_time_limit=duration
        )
        print("Done recording.")
    return recorded_audio
|
37 |
+
|
38 |
+
# 3. Convert the recorded voice to text through speech-to-text (STT)
def audio_to_text(audio):
    """Transcribe recorded audio using Google's free speech-recognition API.

    Returns the decoded text on success. On failure it returns one of two
    fixed sentinel strings (callers compare against them literally), so this
    function never raises the recognizer's own exceptions.
    """
    recognizer = sr.Recognizer()
    print("Recognizing the text...")
    try:
        decoded = recognizer.recognize_google(audio, language="en-US")
    except sr.UnknownValueError:
        # Speech was captured but could not be decoded.
        return "Google Speech Recognition could not understand the audio."
    except sr.RequestError:
        # Network/API failure reaching the recognition service.
        return "Could not request results from Google Speech Recognition service."
    print("Decoded Text: {}".format(decoded))
    return decoded
|
50 |
+
|
51 |
+
# 4. Convert the text to voice through text-to-speech (TTS)
def text_to_speech(text):
    """Speak `text` aloud without touching the filesystem.

    gTTS renders MP3 bytes into an in-memory buffer, pydub decodes them,
    and playback blocks until the audio finishes.
    """
    mp3_buffer = BytesIO()
    gTTS(text).write_to_fp(mp3_buffer)
    mp3_buffer.seek(0)  # rewind so pydub reads from the start
    play(AudioSegment.from_file(mp3_buffer, format="mp3"))
|
59 |
+
|
60 |
+
# 5. Make a voice-to-voice stream
def voice_to_voice():
    """Echo loop: record speech, transcribe it, and speak the transcript back."""
    text_to_speech(audio_to_text(record_audio()))
|
65 |
+
|
66 |
+
# 6. Integrate an LLM to respond to voice input with voice output
# Load GOOGLE_API_KEY from a .env file / environment and configure Gemini.
# NOTE(review): if the variable is unset, getenv returns None and configure()
# is still called — failures will surface later at generate time; consider
# failing fast here.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
# Module-level model handle shared by respond_by_gemini().
gemini_pro = genai.GenerativeModel(model_name="models/gemini-pro")
|
71 |
+
|
72 |
+
def respond_by_gemini(input_text, role_text, instructions_text):
    """Ask Gemini for a reply and return the full response text.

    The prompt is a three-part list (role, user input, instructions). The
    response is requested as a stream and the chunks are concatenated into
    a single string before returning.
    """
    prompt = [
        "ROLE: " + role_text,
        "INPUT_TEXT: " + input_text,
        instructions_text,
    ]
    stream = gemini_pro.generate_content(
        prompt,
        stream=True,
    )
    return "".join(chunk.text for chunk in stream)
|
87 |
+
|
88 |
+
def llm_voice_response():
    """Record a question, answer it with Gemini, and speak the answer.

    If speech recognition failed (the transcript equals one of the two
    sentinel error strings), the error message itself is spoken instead of
    being sent to the LLM.
    """
    topic = 'The future of artificial intelligence'
    role_text = 'You are an intelligent assistant to chat on the topic: `{}`.'.format(topic)
    instructions_text = (
        'Respond to the INPUT_TEXT briefly in chat style. '
        'Respond based on your knowledge about `{}` in brief chat style.'
    ).format(topic)

    text = audio_to_text(record_audio())
    stt_errors = (
        "Google Speech Recognition could not understand the audio.",
        "Could not request results from Google Speech Recognition service.",
    )
    if text in stt_errors:
        response_text = text
    else:
        response_text = respond_by_gemini(text, role_text, instructions_text)
    text_to_speech(response_text)
|
101 |
+
|
102 |
+
# 7. Build a Web interface for the LLM-supported voice assistant
def main():
    """Streamlit entry point: configure the page, inject styling, and wire
    the record button to the voice-assistant pipeline."""
    # Streamlit setup with custom CSS
    st.set_page_config(page_title="LLM-Supported Voice Assistant", layout="wide")

    # Inject Bootstrap plus a small custom stylesheet; unsafe_allow_html is
    # required for raw <link>/<style> tags to render.
    st.markdown("""
    <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet">
    <style>
    .main {background-color: #f5f5f5;}
    .container {max-width: 800px; margin: auto; padding-top: 50px;}
    .title {font-family: 'Arial', sans-serif; color: #333333; margin-bottom: 30px;}
    .btn {background-color: #4CAF50; color: white; border: none; padding: 10px 20px; cursor: pointer; font-size: 16px;}
    .btn:hover {background-color: #45a049;}
    </style>
    """, unsafe_allow_html=True)

    st.markdown("<div class='container'><h1 class='title'>LLM-Supported Voice Assistant</h1></div>", unsafe_allow_html=True)

    st.write("This is a voice assistant with LLM support. Speak to the microphone, and the assistant will respond.")

    # One click runs the full record -> transcribe -> LLM -> speak cycle;
    # the call blocks the Streamlit script until playback finishes.
    if st.button("Record and Get Response", key="record_btn"):
        st.write("Listening...")
        llm_voice_response()
        st.write("Done.")

    st.markdown("<div class='container'><h5>Press the button and speak to the microphone. The assistant will generate a response based on the input and speak it out loud.</h5></div>", unsafe_allow_html=True)
|
128 |
+
|
129 |
+
# Run the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()
|