Satyam-Singh committed (verified)
Commit a0e2f2a · 1 Parent(s): 3f52819

Create app.py

Files changed (1):
  app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
import os
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
import speech_recognition as sr
import google.generativeai as genai
from dotenv import load_dotenv
import streamlit as st

# Path to store voice files
path = "../data/voice/"
os.makedirs(path, exist_ok=True)
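
# Environment note: pydub needs ffmpeg on PATH to decode and play MP3 audio,
# and sr.Microphone requires PyAudio; both are assumed to be installed.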

# 1. Save and play voice created by Google Text-to-Speech (gTTS)
def text_to_audio(text, filename):
    tts = gTTS(text)
    file_path = os.path.join(path, filename)
    tts.save(file_path)
    return file_path


def play_audio(file_path):
    audio = AudioSegment.from_file(file_path)
    play(audio)
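
# Example usage (hypothetical file name):
#   play_audio(text_to_audio("Hello there!", "greeting.mp3"))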

# 2. Use microphone to record voice
def record_audio(duration=4):
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Adjusting noise...")
        recognizer.adjust_for_ambient_noise(source, duration=1)
        print(f"Recording for {duration} seconds...")
        # `timeout` only bounds the wait for speech to start; `phrase_time_limit`
        # caps the recording length so it actually stops after `duration` seconds.
        recorded_audio = recognizer.listen(source, timeout=duration, phrase_time_limit=duration)
        print("Done recording.")
        return recorded_audio
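
# Example: capture up to ~5 seconds from the default microphone.
#   audio = record_audio(duration=5)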

# 3. Convert the recorded voice to text through speech-to-text (STT)
def audio_to_text(audio):
    recognizer = sr.Recognizer()
    try:
        print("Recognizing the text...")
        text = recognizer.recognize_google(audio, language="en-US")
        print("Decoded Text: {}".format(text))
    except sr.UnknownValueError:
        text = "Google Speech Recognition could not understand the audio."
    except sr.RequestError:
        text = "Could not request results from Google Speech Recognition service."
    return text
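
# Note: recognize_google uses Google's free web Speech API, which needs an
# internet connection but no API key; failures come back as plain strings,
# which llm_voice_response below branches on.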

# 4. Convert the text to voice through text-to-speech (TTS)
def text_to_speech(text):
    tts = gTTS(text)
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    audio_segment = AudioSegment.from_file(audio_buffer, format="mp3")
    play(audio_segment)
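
# Note: play() outputs audio on the machine running this script, so under
# Streamlit the response is audible on the server, not in the browser;
# st.audio(audio_buffer) would be a browser-side alternative.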

# 5. Make a voice-to-voice stream
def voice_to_voice():
    recorded_audio = record_audio()
    text = audio_to_text(recorded_audio)
    text_to_speech(text)
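
# voice_to_voice() simply echoes the transcribed speech back as synthesized
# audio; it is the building block the LLM-backed flow below extends.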

# 6. Integrate an LLM to respond to voice input with voice output
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
gemini_pro = genai.GenerativeModel(model_name="models/gemini-pro")
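
# Assumes a .env file (or environment variable) providing:
#   GOOGLE_API_KEY=<your Google AI Studio key>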

def respond_by_gemini(input_text, role_text, instructions_text):
    final_prompt = [
        "ROLE: " + role_text,
        "INPUT_TEXT: " + input_text,
        instructions_text,
    ]
    response = gemini_pro.generate_content(
        final_prompt,
        stream=True,
    )
    response_list = []
    for chunk in response:
        response_list.append(chunk.text)
    response_text = "".join(response_list)
    return response_text
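
# Example call (hypothetical role and instructions):
#   respond_by_gemini("Will AI replace programmers?",
#                     "You are an intelligent assistant.",
#                     "Respond briefly in chat style.")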

def llm_voice_response():
    role = 'You are an intelligent assistant to chat on the topic: `{}`.'
    topic = 'The future of artificial intelligence'
    role_text = role.format(topic)
    instructions = 'Respond to the INPUT_TEXT briefly in chat style. Respond based on your knowledge about `{}` in brief chat style.'
    instructions_text = instructions.format(topic)

    recorded_audio = record_audio()
    text = audio_to_text(recorded_audio)
    response_text = text
    # Only query the LLM when transcription succeeded; otherwise speak the error.
    if text not in [
        "Google Speech Recognition could not understand the audio.",
        "Could not request results from Google Speech Recognition service.",
    ]:
        response_text = respond_by_gemini(text, role_text, instructions_text)
    text_to_speech(response_text)

# 7. Build a Web interface for the LLM-supported voice assistant
def main():
    # Streamlit setup with custom CSS
    st.set_page_config(page_title="LLM-Supported Voice Assistant", layout="wide")

    st.markdown("""
        <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet">
        <style>
        .main {background-color: #f5f5f5;}
        .container {max-width: 800px; margin: auto; padding-top: 50px;}
        .title {font-family: 'Arial', sans-serif; color: #333333; margin-bottom: 30px;}
        .btn {background-color: #4CAF50; color: white; border: none; padding: 10px 20px; cursor: pointer; font-size: 16px;}
        .btn:hover {background-color: #45a049;}
        </style>
        """, unsafe_allow_html=True)

    st.markdown("<div class='container'><h1 class='title'>LLM-Supported Voice Assistant</h1></div>", unsafe_allow_html=True)

    st.write("This is a voice assistant with LLM support. Speak into the microphone, and the assistant will respond.")

    if st.button("Record and Get Response", key="record_btn"):
        st.write("Listening...")
        llm_voice_response()
        st.write("Done.")

    st.markdown("<div class='container'><h5>Press the button and speak into the microphone. The assistant will generate a response based on the input and speak it out loud.</h5></div>", unsafe_allow_html=True)

if __name__ == "__main__":
    main()
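
# To try it locally (assuming the packages imported above are installed):
#   streamlit run app.py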