rohan13 committed
Commit b8b8495 · 1 Parent(s): 1839d37

(cherry picked from commit 314d9665c9ac0eed50d9a471dffef9cb1e665e40)

Files changed (7)
  1. app.py +136 -0
  2. assets/logo.png +0 -0
  3. main.py +13 -0
  4. models/openai_vs.index +0 -0
  5. models/openai_vs.pkl +0 -0
  6. requirements.txt +11 -0
  7. utils.py +271 -0
app.py ADDED
@@ -0,0 +1,136 @@
+ import gradio as gr
+ from main import index, run, ingest_files
+ from gtts import gTTS
+ import os, time
+
+ from transformers import pipeline
+
+ p = pipeline("automatic-speech-recognition")
+
+ # Text questions call the chat method from main.py
+
+ models = ["GPT-3.5", "Flan UL2", "Flan T5"]
+
+ name = os.environ.get("name", "Rohan")
+
+ def add_text(history, text, model):
+     print("Question asked: " + text)
+     response = run_model(text, model)
+     history = history + [(text, response)]
+     print(history)
+     return history, ""
+
+
+ def run_model(text, model):
+     start_time = time.time()
+     print("start time: " + str(start_time))
+     response = run(text, model)
+     end_time = time.time()
+     # If the response contains `SOURCES:`, put the sources on their own line
+     if "SOURCES:" in response:
+         response = response.replace("SOURCES:", "\nSOURCES:")
+     print(response)
+     print("Time taken: " + str(end_time - start_time))
+     return response
+
+
+ def get_output(history, audio, model):
+     # audio.change also fires when the widget is cleared below; skip that case
+     if audio is None:
+         return history, None
+     txt = p(audio)["text"]
+     audio_path = 'response.wav'
+     response = run_model(txt, model)
+     # Drop everything from SOURCES: onward before synthesizing speech
+     trimmed_response = response.split("SOURCES:")[0]
+     myobj = gTTS(text=trimmed_response, lang='en', slow=False)
+     myobj.save(audio_path)
+     history.append(((audio,), (audio_path,)))
+     print(history)
+     # Returning None as the second output clears the microphone widget
+     return history, None
+
+ def set_model(history, model, first_time=False):
+     print("Model selected: " + model)
+     history = get_first_message(history)
+     index(model, first_time)
+     return history
+
+
+ def get_first_message(history):
+     history = [(None,
+                 "Hi! I am " + name + "'s Personal Assistant. Want " + name + " to answer your questions? Just Roar it!")]
+     return history
+
+ def bot(history):
+     return history
+
+ def upload_file(files, history, model):
+     file_paths = [file.name for file in files]
+     print("Ingesting files: " + str(file_paths))
+     text = 'Uploaded a file'
+     if ingest_files(file_paths, model):
+         response = 'Files are ingested'
+     else:
+         response = 'Files are not ingested'
+
+     history = history + [(text, response)]
+     return history
+
+ theme = gr.Theme.from_hub("snehilsanyal/scikit-learn")
+
+ theme.block_background_fill = gr.themes.colors.neutral.c200
+
+
+ with gr.Blocks(theme) as demo:
+     # Roar logo, served from the local assets directory
+     gr.HTML('<img src="file/assets/logo.png" style="width: 100px; height: 100px; margin: 0 auto;border:5px solid orange;border-radius: 50%; display: block">')
+     # Page title, centered
+     gr.HTML("<h1 style='text-align: center;'>Roar - A Personal Assistant</h1>")
+
+     chatbot = gr.Chatbot(get_first_message([]), elem_id="chatbot").style(height=500)
+
+     with gr.Row():
+         # Radio button to select the model (hidden for now)
+         radio = gr.Radio(models, label="Choose a model", value="GPT-3.5", type="value", visible=False)
+     with gr.Row():
+         with gr.Column(scale=0.6):
+             txt = gr.Textbox(
+                 label="Rohan Bot",
+                 placeholder="Enter text and press enter, or upload a file", lines=1
+             ).style(container=False)
+
+         with gr.Column(scale=0.2):
+             upload = gr.UploadButton(label="Upload a file", type="file", file_count='multiple', file_types=['docx', 'txt', 'pdf', 'html']).style(container=False)
+
+         with gr.Column(scale=0.2):
+             audio = gr.Audio(source="microphone", type="filepath").style(container=False)
+
+     with gr.Row():
+         gr.Examples(examples=['What are you an expert of?'], inputs=[txt], label="Examples")
+
+     txt.submit(add_text, [chatbot, txt, radio], [chatbot, txt], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     radio.change(fn=set_model, inputs=[chatbot, radio], outputs=[chatbot]).then(bot, chatbot, chatbot)
+
+     # get_output's second return value (None) resets the microphone input
+     audio.change(fn=get_output, inputs=[chatbot, audio, radio], outputs=[chatbot, audio], show_progress=True).then(
+         bot, chatbot, chatbot
+     )
+
+     upload.upload(upload_file, inputs=[upload, chatbot, radio], outputs=[chatbot]).then(bot, chatbot, chatbot)
+
+     # Build or load the vectorstore once at startup; the returned history is discarded
+     set_model(chatbot, radio.value, first_time=True)
+
+
+ if __name__ == "__main__":
+     demo.queue(concurrency_count=5)
+     demo.launch(debug=True)
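The voice path in get_output above is ASR in, gTTS out. A minimal standalone sketch of that round-trip without Gradio, assuming a local recording question.wav; the file names and the placeholder answer are illustrative, not part of the app:

    from transformers import pipeline
    from gtts import gTTS

    asr = pipeline("automatic-speech-recognition")  # same default model as app.py
    text = asr("question.wav")["text"]              # speech -> text
    answer = "placeholder for run(text, model)"     # the app calls the LLM chain here
    gTTS(text=answer, lang="en", slow=False).save("answer.wav")  # text -> speech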
assets/logo.png ADDED
main.py ADDED
@@ -0,0 +1,13 @@
+ from utils import get_search_index, generate_answer, set_model_and_embeddings, ingest
+
+ def index(model, first_time=False):
+     set_model_and_embeddings(model)
+     get_search_index(model, first_time=first_time)
+     return True
+
+ def ingest_files(file_paths, model):
+     return ingest(file_paths, model)
+
+ def run(question, model):
+     index(model)
+     return generate_answer(question)
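main.py is a thin facade over utils.py. A minimal usage sketch, assuming OPENAI_API_KEY is exported and either the pickled index below or a local docs/ corpus is present (the question string is illustrative):

    from main import index, run

    index("GPT-3.5", first_time=True)   # build or load the FAISS vectorstore
    print(run("What are you an expert of?", "GPT-3.5"))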
models/openai_vs.index ADDED
Binary file (43.1 kB)
models/openai_vs.pkl ADDED
Binary file (49.6 kB)
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ langchain
+ openai
+ faiss-cpu==1.7.3
+ unstructured==0.5.8
+ ffmpeg-python
+ transformers
+ gtts
+ torch
+ tiktoken
+ huggingface-hub
+ gradio
utils.py ADDED
@@ -0,0 +1,271 @@
+ import os
+ import pickle
+ import traceback
+
+ import faiss
+ import langchain
+ from langchain import HuggingFaceHub
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, UnstructuredPDFLoader, UnstructuredWordDocumentLoader
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain.llms.openai import OpenAI
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+     SystemMessagePromptTemplate,
+ )
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.vectorstores.faiss import FAISS
+ from langchain.cache import InMemoryCache
+
+
+ langchain.llm_cache = InMemoryCache()
+
+ models = ["GPT-3.5", "Flan UL2", "GPT-4", "Flan T5"]
+
+ pickle_file = "_vs.pkl"
+ updated_pickle_file = "_vs_updated.pkl"
+ index_file = "_vs.index"
+ models_folder = "models/"
+
+ llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
+
+ embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+
+ chat_history = []
+
+ memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)
+
+ vectorstore_index = None
+
+ # Name used in the prompt, taken from the environment variable `name`
+ name = os.environ.get("name", "Rohan")
+
+ system_template = """You are ROAR, {name}'s personal assistant, supposed to ANSWER QUESTIONS ON HIS BEHALF.
+ STRICTLY FOLLOW THIS: FOR OPINIONS, PREFERENCES, EXPERIENCES, ALWAYS ANSWER IN FIRST PERSON AS IF YOU ARE {name}. ALWAYS generate this response with experiences/opinions using {name}'s RESUME available in the context/vectorstore.
+ The response should be in professional language and tone, impressive, catchy, and grammatically correct.
+ Use {name}'s resume and your knowledge of his experience and skills to answer questions to the best of your ability.
+ Answer the question as if you are assisting {name} or answering on his behalf.
+ ----------------
+ This activity of answering questions on {name}'s behalf will be called Roar.
+ For example: if someone wants to ask you a question, they will say "Roar it" and you will answer the question on {name}'s behalf by generating a response using {name}'s resume and your knowledge of his experience and skills.
+ Add a quirky and funny line at the end to encourage the user to try more Roars, as they are free.
+ ----------------
+ {context}
+ """
+ # Pre-fill {name} while re-emitting {context} literally, so the chain can fill it at query time
+ system_template = system_template.format(name=name, context="{context}")
+
+ messages = [
+     SystemMessagePromptTemplate.from_template(system_template),
+     HumanMessagePromptTemplate.from_template("{question}"),
+ ]
+ CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
+
+
+ def set_model_and_embeddings(model):
+     global chat_history
+     set_model(model)
+     # set_embeddings(model)
+     chat_history = []
+
+
+ def set_model(model):
+     global llm
+     print("Setting model to " + str(model))
+     if model == "GPT-3.5":
+         print("Loading GPT-3.5")
+         llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)
+     elif model == "GPT-4":
+         print("Loading GPT-4")
+         llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
+     elif model == "Flan UL2":
+         print("Loading Flan-UL2")
+         llm = HuggingFaceHub(repo_id="google/flan-ul2", model_kwargs={"temperature": 0.1, "max_new_tokens": 500})
+     elif model == "Flan T5":
+         print("Loading Flan T5")
+         llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature": 0.1})
+     else:
+         print("Loading GPT-3.5 from else")
+         llm = OpenAI(model_name="text-davinci-002", temperature=0.1)
+
+
+ def set_embeddings(model):
+     global embeddings
+     if model == "GPT-3.5" or model == "GPT-4":
+         print("Loading OpenAI embeddings")
+         embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+     elif model == "Flan UL2" or model == "Flan T5":
+         print("Loading Hugging Face embeddings")
+         embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")
+
+
+ def get_search_index(model, first_time=False):
+     global vectorstore_index
+     if not first_time:
+         print("Using updated pickle file")
+         file = updated_pickle_file
+     else:
+         print("Using base pickle file")
+         file = pickle_file
+     if os.path.isfile(get_file_path(model, file)) and os.path.isfile(
+             get_file_path(model, index_file)) and os.path.getsize(get_file_path(model, file)) > 0:
+         # Load the index from the chosen pickle file
+         search_index = load_index(model, file)
+     else:
+         search_index = create_index(model)
+
+     vectorstore_index = search_index
+     return search_index
+
+
+ def load_index(model, file=pickle_file):
+     # `file` is the pickle to read: the base index or the updated one
+     with open(get_file_path(model, file), "rb") as f:
+         search_index = pickle.load(f)
+     print("Loaded index")
+     return search_index
+
+
+ def create_index(model):
+     sources = fetch_data_for_embeddings()
+     source_chunks = split_docs(sources)
+     search_index = search_index_from_docs(source_chunks)
+     faiss.write_index(search_index.index, get_file_path(model, index_file))
+     # Save the index to a pickle file
+     with open(get_file_path(model, pickle_file), "wb") as f:
+         pickle.dump(search_index, f)
+     print("Created index")
+     return search_index
+
+
+ def get_file_path(model, file):
+     # OpenAI models share one vectorstore; Hugging Face models share another
+     if model == "GPT-3.5" or model == "GPT-4":
+         return models_folder + "openai" + file
+     else:
+         return models_folder + "hf" + file
+
+
+ def search_index_from_docs(source_chunks):
+     search_index = FAISS.from_documents(source_chunks, embeddings)
+     return search_index
+
+
+ def get_html_files():
+     loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
+     document_list = loader.load()
+     return document_list
+
+
+ def fetch_data_for_embeddings():
+     document_list = get_word_files()
+     document_list.extend(get_html_files())
+
+     print("document list: " + str(len(document_list)))
+     return document_list
+
+
+ def get_word_files():
+     loader = DirectoryLoader('docs', glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader, recursive=True)
+     document_list = loader.load()
+     return document_list
+
+
+ def split_docs(docs):
+     splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
+     source_chunks = splitter.split_documents(docs)
+     print("chunks: " + str(len(source_chunks)))
+     return source_chunks
+
+
+ def load_documents(file_paths):
+     # Pick a loader based on the file extension
+     document_list = []
+     for file_path in file_paths:
+         if file_path.endswith(".txt"):
+             loader = TextLoader(file_path)
+         elif file_path.endswith(".docx"):
+             loader = UnstructuredWordDocumentLoader(file_path)
+         elif file_path.endswith(".html"):
+             loader = UnstructuredHTMLLoader(file_path)
+         elif file_path.endswith(".pdf"):
+             loader = UnstructuredPDFLoader(file_path)
+         else:
+             print("Unsupported file type")
+             raise Exception("Unsupported file type")
+         docs = loader.load()
+         document_list.extend(docs)
+
+     print("Loaded " + str(len(document_list)) + " documents")
+     return document_list
+
+
+ def add_to_index(docs, index, model):
+     global vectorstore_index
+     index.add_documents(docs)
+     with open(get_file_path(model, updated_pickle_file), "wb") as f:
+         pickle.dump(index, f)
+     vectorstore_index = index
+     print("Vectorstore index updated")
+     return True
+
+
+ def ingest(file_paths, model):
+     print("Ingesting files")
+     try:
+         # handle txt, docx, html, pdf
+         docs = load_documents(file_paths)
+         # Split into chunks first; the chunks, not the raw docs, get embedded
+         chunks = split_docs(docs)
+         add_to_index(chunks, vectorstore_index, model)
+         print("Ingestion complete")
+     except Exception:
+         traceback.print_exc()
+         return False
+     return True
+
+
+ def get_qa_chain(vectorstore_index):
+     global llm
+     print(llm)
+
+     retriever = vectorstore_index.as_retriever(search_type="similarity_score_threshold",
+                                                search_kwargs={"score_threshold": .8})
+
+     chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
+                                                   verbose=True, get_chat_history=get_chat_history,
+                                                   combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
+     return chain
+
+
+ def get_chat_history(inputs) -> str:
+     res = []
+     for human, ai in inputs:
+         res.append(f"Human:{human}\nAI:{ai}")
+     return "\n".join(res)
+
+
+ def generate_answer(question) -> str:
+     global chat_history, vectorstore_index
+     chain = get_qa_chain(vectorstore_index)
+
+     result = chain(
+         {"question": question, "chat_history": chat_history, "vectordbkwargs": {"search_distance": 0.6}})
+     # Keep only the latest turn; the chain receives a one-turn history window
+     chat_history = [(question, result["answer"])]
+     sources = []
+     print(result)
+
+     # Collect the bare file names of the retrieved source documents
+     for document in result['source_documents']:
+         sources.append(document.metadata['source'].split('/')[-1].split('.')[0])
+     print(sources)
+
+     source = ',\n'.join(set(sources))
+     return result['answer'] + '\nSOURCES: ' + source
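Putting utils.py together, a hedged end-to-end smoke test, assuming OPENAI_API_KEY is exported, the pickled models/openai_vs.pkl from this commit is present, and sample.pdf is an illustrative local file:

    import os

    os.environ.setdefault("name", "Rohan")  # read by utils.py at import time
    from utils import set_model_and_embeddings, get_search_index, ingest, generate_answer

    set_model_and_embeddings("GPT-3.5")           # choose the LLM, reset chat history
    get_search_index("GPT-3.5", first_time=True)  # load (or build) the FAISS index
    ingest(["sample.pdf"], "GPT-3.5")             # optional: index a new document
    print(generate_answer("What are you an expert of?"))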