own knowledge gpt
- .gitignore +3 -0
- README.md +5 -4
- app.py +194 -0
- bot/utils/callbacks.py +17 -0
- bot/utils/constanst.py +8 -0
- bot/utils/show_log.py +12 -0
- bot/web_scrapping/crawler_and_indexer.py +85 -0
- bot/web_scrapping/searchable_index.py +148 -0
- bot/web_scrapping/single_crawler.py +57 -0
- requirements.txt +166 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.idea
+.DS_Store
+__pycache__
README.md
CHANGED
@@ -1,12 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Presight GPT
+emoji: π
+colorFrom: indigo
+colorTo: red
 sdk: gradio
 sdk_version: 4.2.0
 app_file: app.py
 pinned: false
+license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,194 @@
from typing import List, Optional, Tuple
from queue import Empty, Queue
from threading import Thread
from bot.web_scrapping.crawler_and_indexer import content_crawler_and_index
from bot.web_scrapping.searchable_index import SearchableIndex
from bot.utils.callbacks import QueueCallback
from bot.utils.constanst import set_api_key
from bot.utils.show_log import logger
from langchain.chat_models import ChatOpenAI
from langchain.prompts import HumanMessagePromptTemplate
from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage
import gradio as gr

set_api_key()
MODELS_NAMES = ["gpt-3.5-turbo"]
DEFAULT_TEMPERATURE = 0.7

ChatHistory = List[str]

default_system_prompt = 'Put your prompt here'
default_system_format = 'txt'
human_message_prompt_template = HumanMessagePromptTemplate.from_template("{text}")


def learning_feedbacks():
    return 'Training Completed'


def bot_learning(urls, file_formats, chat_mode=False):
    # Crawl and index the given URL; in chat mode return the index object
    # instead of a training-status string.
    index = content_crawler_and_index(url=str(urls), file_format=file_formats)
    if chat_mode:
        return index
    else:
        fb = learning_feedbacks()
        return fb


def chat_start(
        chat: Optional[ChatOpenAI],
        message: str,
        chatbot_messages: ChatHistory,
        messages: List[BaseMessage],
) -> Tuple[str, str, ChatOpenAI, ChatHistory, List[BaseMessage]]:
    # Stream a chat answer: a worker thread runs the retrieval query while this
    # generator drains tokens from the queue into the chatbot history.
    if not chat:
        queue = Queue()
        chat = ChatOpenAI(
            model_name=MODELS_NAMES[0],
            temperature=DEFAULT_TEMPERATURE,
            streaming=True,
            callbacks=[QueueCallback(queue)],
        )
    else:
        queue = chat.callbacks[0].queue

    job_done = object()
    messages.append(HumanMessage(content=f':{message}'))
    chatbot_messages.append((message, ""))
    index = bot_learning(urls='NO_URL', file_formats='txt', chat_mode=True)

    def query_retrieval():
        response = SearchableIndex.query(message, chat, index)
        chatbot_message = AIMessage(content=response)
        messages.append(chatbot_message)
        queue.put(job_done)

    t = Thread(target=query_retrieval)
    t.start()
    content = ""
    while True:
        try:
            next_token = queue.get(True, timeout=1)
            if next_token is job_done:
                break
            content += next_token
            chatbot_messages[-1] = (message, content)
            yield chat, "", chatbot_messages, messages
        except Empty:
            continue
    messages.append(AIMessage(content=content))
    logger.info("Done!")
    return chat, "", chatbot_messages, messages


def system_prompt_handler(value: str) -> str:
    return value


def on_clear_button_click(system_prompt: str) -> Tuple[str, List, List]:
    return "", [], [SystemMessage(content=system_prompt)]


def on_apply_settings_button_click(
        system_prompt: str, model_name: str, temperature: float
):
    logger.info(
        f"Applying settings: model_name={model_name}, temperature={temperature}"
    )
    chat = ChatOpenAI(
        model_name=model_name,
        temperature=temperature,
        streaming=True,
        callbacks=[QueueCallback(Queue())],
        max_tokens=1000,
    )
    chat.callbacks[0].queue.empty()
    return chat, *on_clear_button_click(system_prompt)


with gr.Blocks() as demo:
    system_prompt = gr.State(default_system_prompt)
    messages = gr.State([SystemMessage(content=default_system_prompt)])
    chat = gr.State(None)

    with gr.Column(elem_id="col_container"):
        gr.Markdown("# Welcome to OWN-GPT! π€")
        gr.Markdown(
            "Demo Chat Bot Platform"
        )

        chatbot = gr.Chatbot()
        with gr.Column():
            message = gr.Textbox(label="Type some message")
            message.submit(
                chat_start,
                [chat, message, chatbot, messages],
                [chat, message, chatbot, messages],
                queue=True,
            )
            message_button = gr.Button("Submit", variant="primary")
            message_button.click(
                chat_start,
                [chat, message, chatbot, messages],
                [chat, message, chatbot, messages],
            )
        with gr.Column():
            learning_status = gr.Textbox(label='Training Status')
            url = gr.Textbox(label="URL to Documents")
            file_format = gr.Textbox(label="Set your file format:", placeholder='Example: pdf, txt')
            url.submit(
                bot_learning,
                [url, file_format],
                [learning_status]
            )
            training_button = gr.Button("Training", variant="primary")
            training_button.click(
                bot_learning,
                [url, file_format],
                [learning_status]
            )
        with gr.Row():
            with gr.Column():
                clear_button = gr.Button("Clear")
                clear_button.click(
                    on_clear_button_click,
                    [system_prompt],
                    [message, chatbot, messages],
                    queue=False,
                )
            with gr.Accordion("Settings", open=False):
                model_name = gr.Dropdown(
                    choices=MODELS_NAMES, value=MODELS_NAMES[0], label="model"
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="temperature",
                    interactive=True,
                )
                apply_settings_button = gr.Button("Apply")
                apply_settings_button.click(
                    on_apply_settings_button_click,
                    [system_prompt, model_name, temperature],
                    [chat, message, chatbot, messages],
                )

        with gr.Column():
            system_prompt_area = gr.TextArea(
                default_system_prompt, lines=4, label="prompt", interactive=True
            )
            system_prompt_area.input(
                system_prompt_handler,
                inputs=[system_prompt_area],
                outputs=[system_prompt],
            )
            system_prompt_button = gr.Button("Set")
            system_prompt_button.click(
                on_apply_settings_button_click,
                [system_prompt, model_name, temperature],
                [chat, message, chatbot, messages],
            )

demo.queue()
demo.launch()
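For reference, the same flow can be driven without the Gradio UI. A rough sketch, not part of the commit: it assumes the definitions above are already in scope (run inside app.py's own namespace or a notebook, since importing app also launches the interface) and that a valid OpenAI key and network access are available.

# Rough sketch, assuming bot_learning, chat_start and default_system_prompt are in scope.
from langchain.schema import SystemMessage

print(bot_learning("https://www.presight.io/terms-of-use.html", "txt"))  # crawl + index -> "Training Completed"

history, msgs = [], [SystemMessage(content=default_system_prompt)]
for _, _, history, msgs in chat_start(None, "What does the page cover?", history, msgs):
    pass  # each iteration streams one more token into history[-1]
print(history[-1][1])  # final streamed answer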
bot/utils/callbacks.py
ADDED
@@ -0,0 +1,17 @@
from queue import Queue
from typing import Any

from langchain.callbacks.base import BaseCallbackHandler


class QueueCallback(BaseCallbackHandler):
    """Callback handler for streaming LLM responses to a queue."""

    def __init__(self, queue: Queue):
        self.queue = queue

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        self.queue.put(token)

    def on_llm_end(self, *args, **kwargs: Any) -> None:
        return self.queue.empty()
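As a minimal sketch of how this handler is meant to be used (illustrative, not part of the commit; `llm`, `token_queue` and `generate` are names chosen here): the callback pushes each generated token onto the queue from the generation thread, and the main thread drains it, which is the same producer/consumer pattern app.py builds its UI loop around.

# Minimal sketch: stream tokens through QueueCallback while the main thread drains the queue.
from queue import Empty, Queue
from threading import Thread

from langchain.chat_models import ChatOpenAI
from bot.utils.callbacks import QueueCallback

token_queue = Queue()
llm = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True,
                 callbacks=[QueueCallback(token_queue)])
done = object()  # sentinel pushed once generation has finished


def generate():
    llm.predict("Say hello")  # tokens arrive in token_queue via on_llm_new_token
    token_queue.put(done)


Thread(target=generate).start()
while True:
    try:
        token = token_queue.get(timeout=1)
    except Empty:
        continue
    if token is done:
        break
    print(token, end="", flush=True)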
bot/utils/constanst.py
ADDED
@@ -0,0 +1,8 @@
import os

API_KEY = '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'


def set_api_key(api_key=API_KEY):
    os.environ['OPENAI_API_KEY'] = api_key
    return 'API KEY SUCCESSFULLY'
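A key hard-coded like the one above ships with the repository. A sketch of an environment-based alternative (not part of the commit; it assumes OPENAI_API_KEY is provided by the shell or a local .env file, and relies on python-dotenv, which is already pinned in requirements.txt):

# Sketch: read the key from the environment or a .env file instead of the source.
import os
from dotenv import load_dotenv


def set_api_key(api_key=None):
    load_dotenv()  # pick up a local .env file if one exists
    key = api_key or os.environ.get('OPENAI_API_KEY')
    if not key:
        raise RuntimeError('OPENAI_API_KEY is not set')
    os.environ['OPENAI_API_KEY'] = key
    return 'API KEY SUCCESSFULLY'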
bot/utils/show_log.py
ADDED
@@ -0,0 +1,12 @@
import logging

# Configure logging to display in terminal only.
# basicConfig already attaches a StreamHandler to the root logger, so adding a
# second handler here would print every message twice.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Get the root logger configured above
logger = logging.getLogger()
bot/web_scrapping/crawler_and_indexer.py
ADDED
@@ -0,0 +1,85 @@
from bs4 import BeautifulSoup
from urllib import request
from bot.web_scrapping.searchable_index import SearchableIndex
from bot.utils.show_log import logger
from bot.utils.constanst import set_api_key
import pandas as pd
import requests
import os

set_api_key(api_key='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')


def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
    file_path = os.path.join(output_folder, f"combined_content.{file_format}")
    if file_format == 'txt':
        with open(f"{file_path}", "a", encoding="utf-8") as file:
            for t in text:
                file.write(f'{t.text}\n')
        logger.info(f"Content appended to {file_path}")
    elif file_format == 'pdf':
        request.urlretrieve(url, file_path)
        logger.info(f"Content appended to {file_path}")
    elif file_format == 'csv':
        df = pd.DataFrame({'Content': [t.text for t in text]})
        df.to_csv(f"{file_path}", mode='a', index=False, header=False)
        logger.info(f"Content appended to {file_path}")
    elif file_format == 'xml':
        xml_content = ''.join([f'<item>{t.text}</item>' for t in text])
        with open(f"{file_path}", "a", encoding="utf-8") as file:
            file.write(xml_content)
        logger.info(f"Content appended to {file_path}")
    else:
        logger.warning("Invalid file format. Supported formats: txt, pdf, csv, xml")
    return file_path


def content_crawler_and_index(url, file_format='txt', output_folder='learning_documents'):
    if url != 'NO_URL':
        # Send an HTTP GET request to the URL
        responses = requests.get(url)
        # Check if the request was successful
        if responses.status_code == 200:
            # Create output folder if it doesn't exist
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(responses.text, "html.parser")
            text = soup.find_all(['h2', 'p', 'i', 'ul'])
            if text:
                # Save content based on the specified file format
                file_path = save_content_to_file(text=text, output_folder=output_folder, file_format=file_format)

                # Create or update the index
                index = SearchableIndex.embed_index(url, file_path)
                if os.path.isfile(file_path):
                    os.remove(file_path)
                return index
            else:
                file_path = save_content_to_file(url=url, output_folder=output_folder, file_format=file_format)
                index = SearchableIndex.embed_index(url, file_path)
                if os.path.isfile(file_path):
                    os.remove(file_path)
                return index

        else:
            logger.warning("Failed to retrieve content from the URL.")
    else:
        index = SearchableIndex.embed_index(url=url, path=output_folder)
        return index


if __name__ == '__main__':
    pass
    # Example usage:
    # First URL
    # idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", file_format='txt')
    #
    # Second URL (appends content to existing files)
    # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', file_format='pdf')
    # # example get response chatbot
    # prompt = 'explain the paper'
    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
    # response = SearchableIndex.query(prompt, llm, idx)
    # print(response)
    # logger.info(response)
bot/web_scrapping/searchable_index.py
ADDED
@@ -0,0 +1,148 @@
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
    PyPDFLoader,
    DataFrameLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.chat_models import ChatOpenAI
from bot.utils.show_log import logger
import pandas as pd
import threading
import glob
import os
import queue


class SearchableIndex:
    def __init__(self, path):
        self.path = path

    def get_text_splits(self):
        with open(self.path, 'r') as txt:
            data = txt.read()

        text_split = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                    chunk_overlap=0,
                                                    length_function=len)
        doc_list = text_split.split_text(data)
        return doc_list

    def get_pdf_splits(self):
        loader = PyPDFLoader(self.path)
        pages = loader.load_and_split()
        text_split = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                    chunk_overlap=0,
                                                    length_function=len)
        doc_list = []
        for pg in pages:
            pg_splits = text_split.split_text(pg.page_content)
            doc_list.extend(pg_splits)
        return doc_list

    def get_xml_splits(self, target_col, sheet_name):
        df = pd.read_excel(io=self.path,
                           engine='openpyxl',
                           sheet_name=sheet_name)

        df_loader = DataFrameLoader(df,
                                    page_content_column=target_col)

        excel_docs = df_loader.load()

        return excel_docs

    def get_csv_splits(self):
        csv_loader = CSVLoader(self.path)
        csv_docs = csv_loader.load()
        return csv_docs

    @classmethod
    def merge_or_create_index(cls, index_store, faiss_db, embeddings, logger):
        if os.path.exists(index_store):
            local_db = FAISS.load_local(index_store, embeddings)
            local_db.merge_from(faiss_db)
            logger.info("Merge index completed")
            local_db.save_local(index_store)
            return local_db
        else:
            faiss_db.save_local(folder_path=index_store)
            logger.info("New store created and loaded...")
            local_db = FAISS.load_local(index_store, embeddings)
            return local_db

    @classmethod
    def check_and_load_index(cls, index_files, embeddings, logger, path, result_queue):
        if index_files:
            local_db = FAISS.load_local(index_files[0], embeddings)
            file_to_remove = os.path.join(path, 'combined_content.txt')
            if os.path.exists(file_to_remove):
                os.remove(file_to_remove)
        else:
            # Raising logger.warning(...) would raise None; log and raise a real exception instead.
            logger.warning("Index store does not exist")
            raise FileNotFoundError("Index store does not exist")
        result_queue.put(local_db)  # Put the result in the queue

    @classmethod
    def embed_index(cls, url, path, target_col=None, sheet_name=None):
        embeddings = OpenAIEmbeddings()

        def process_docs(queues, extension):
            nonlocal doc_list
            instance = cls(path)
            if extension == ".txt":
                doc_list = instance.get_text_splits()
            elif extension == ".pdf":
                doc_list = instance.get_pdf_splits()
            elif extension == ".xml":
                doc_list = instance.get_xml_splits(target_col, sheet_name)
            elif extension == ".csv":
                doc_list = instance.get_csv_splits()
            else:
                doc_list = None
            queues.put(doc_list)

        if url != 'NO_URL' and path:
            file_extension = os.path.splitext(path)[1].lower()
            data_queue = queue.Queue()
            thread = threading.Thread(target=process_docs, args=(data_queue, file_extension))
            thread.start()
            doc_list = data_queue.get()
            if not doc_list:
                raise ValueError("Unsupported file format")

            faiss_db = FAISS.from_texts(doc_list, embeddings)
            index_store = os.path.splitext(path)[0] + "_index"
            local_db = cls.merge_or_create_index(index_store, faiss_db, embeddings, logger)
            return local_db, index_store
        elif url == 'NO_URL' and path:
            index_files = glob.glob(os.path.join(path, '*_index'))

            result_queue = queue.Queue()  # Create a queue to store the result

            thread = threading.Thread(target=cls.check_and_load_index,
                                      args=(index_files, embeddings, logger, path, result_queue))
            thread.start()
            local_db = result_queue.get()  # Retrieve the result from the queue
            return local_db

    @classmethod
    def query(cls, question: str, llm, index):
        """Query the vectorstore."""
        llm = llm or ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
        chain = RetrievalQA.from_chain_type(
            llm, retriever=index.as_retriever()
        )
        return chain.run(question)


if __name__ == '__main__':
    pass
    # Examples for search query
    # index = SearchableIndex.embed_index(
    #     path="/Users/macbook/Downloads/AI_test_exam/ChatBot/learning_documents/combined_content.txt")
    # prompt = 'show more detail about types of data collected'
    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
    # result = SearchableIndex.query(prompt, llm=llm, index=index)
    # print(result)
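Note that the commented example above omits the now-required `url` argument of `embed_index`. A rough query round-trip with the current signature would look like the sketch below (not part of the commit; it assumes an index has already been built under the default `learning_documents` folder and that OPENAI_API_KEY is set).

# Sketch: load an existing FAISS index and run one retrieval-QA query against it.
from langchain.chat_models import ChatOpenAI
from bot.web_scrapping.searchable_index import SearchableIndex

index = SearchableIndex.embed_index(url='NO_URL', path='learning_documents')  # load *_index folder
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
print(SearchableIndex.query('show more detail about types of data collected', llm, index))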
bot/web_scrapping/single_crawler.py
ADDED
@@ -0,0 +1,57 @@
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fpdf import FPDF


def content_crawler(url, file_format='txt', output_file='privacy_policy'):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.find_all(['h2', 'p', 'i', 'ul'])

        # Create output folder if it doesn't exist
        if not os.path.exists('../learning_documents'):
            os.makedirs('../learning_documents')

        # Save content based on the specified file format
        output_path = os.path.join('../learning_documents', output_file)

        if file_format == 'txt':
            with open(f"{output_path}.txt", "w", encoding="utf-8") as file:
                for t in text:
                    file.write(f'{t.text}\n')
            print(f"Content saved to {output_path}.txt")
        elif file_format == 'pdf':
            pdf = FPDF()
            pdf.set_auto_page_break(auto=True, margin=15)
            pdf.add_page()
            pdf.set_font("Arial", "B", 8)
            for t in text:
                pdf.cell(0, 10, t.text, ln=True)
            pdf.output(f"{output_path}.pdf")
            print(f"Content saved to {output_path}.pdf")
        elif file_format == 'csv':
            df = pd.DataFrame({'Content': [t.text for t in text]})
            df.to_csv(f"{output_path}.csv", index=False)
            print(f"Content saved to {output_path}.csv")
        elif file_format == 'xml':
            xml_content = ''.join([f'<item>{t.text}</item>' for t in text])
            with open(f"{output_path}.xml", "w", encoding="utf-8") as file:
                file.write(f'<root>{xml_content}</root>')
            print(f"Content saved to {output_path}.xml")
        else:
            print("Invalid file format. Supported formats: txt, pdf, csv, xml")
    else:
        print("Failed to retrieve content from the URL.")


if __name__ == '__main__':
    pass
    # Example usage:
    # content_crawler("https://www.presight.io/privacy-policy.html", file_format='pdf', output_file='privacy_policy')
requirements.txt
ADDED
@@ -0,0 +1,166 @@
aiofiles==23.2.1
aiohttp==3.8.6
aiosignal==1.3.1
altair==5.1.2
annotated-types==0.6.0
anyio==3.7.1
async-timeout==4.0.3
attrs==23.1.0
backoff==2.2.1
bcrypt==4.0.1
beautifulsoup4==4.12.2
cachetools==5.3.2
certifi==2023.7.22
chardet==5.2.0
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.4.16
ci-info==0.3.0
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
configobj==5.0.8
configparser==6.0.0
contourpy==1.2.0
cycler==0.12.1
dataclasses-json==0.6.2
Deprecated==1.2.14
emoji==2.8.0
etelemetry==0.3.1
exceptiongroup==1.1.3
faiss-cpu==1.7.4
fastapi==0.104.1
ffmpy==0.3.1
filelock==3.13.1
filetype==1.2.0
flatbuffers==23.5.26
fonttools==4.44.0
fpdf==1.7.2
frozenlist==1.4.0
fsspec==2023.10.0
future==0.18.3
google-auth==2.23.4
googleapis-common-protos==1.61.0
gradio==3.45.2
gradio_client==0.5.3
grpcio==1.59.2
h11==0.14.0
httpcore==1.0.2
httplib2==0.22.0
httptools==0.6.1
httpx==0.25.1
huggingface-hub==0.17.3
humanfriendly==10.0
idna==3.4
importlib-metadata==6.8.0
importlib-resources==6.1.1
install==1.3.5
isodate==0.6.1
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
jsonschema==4.19.2
jsonschema-specifications==2023.7.1
kiwisolver==1.4.5
kubernetes==28.1.0
langchain==0.0.334
langdetect==1.0.9
langsmith==0.0.63
looseversion==1.3.0
lxml==4.9.3
markdown-it-py==3.0.0
MarkupSafe==2.1.3
marshmallow==3.20.1
matplotlib==3.8.1
mdurl==0.1.2
monotonic==1.6
mpmath==1.3.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.2.1
nibabel==5.1.0
nipype==1.8.6
nltk==3.8.1
numpy==1.26.1
oauthlib==3.2.2
onnxruntime==1.16.2
openai==0.27.3
opentelemetry-api==1.21.0
opentelemetry-exporter-otlp-proto-common==1.21.0
opentelemetry-exporter-otlp-proto-grpc==1.21.0
opentelemetry-proto==1.21.0
opentelemetry-sdk==1.21.0
opentelemetry-semantic-conventions==0.42b0
orjson==3.9.10
overrides==7.4.0
packaging==23.2
pandas==2.1.2
pathlib==1.0.1
pdfminer==20191125
Pillow==10.1.0
posthog==3.0.2
protobuf==4.25.0
prov==2.0.0
pulsar-client==3.3.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycryptodome==3.19.0
pydantic==2.4.2
pydantic_core==2.10.1
pydot==1.4.2
pydub==0.25.1
Pygments==2.16.1
pyparsing==3.1.1
pypdf==3.17.0
PyPDF2==3.0.1
PyPika==0.48.9
python-dateutil==2.8.2
python-dotenv==1.0.0
python-iso639==2023.6.15
python-magic==0.4.27
python-multipart==0.0.6
pytz==2023.3.post1
pyxnat==1.6
PyYAML==6.0.1
rapidfuzz==3.5.2
rdflib==7.0.0
referencing==0.30.2
regex==2023.10.3
requests==2.31.0
requests-oauthlib==1.3.1
rich==13.6.0
rpds-py==0.12.0
rsa==4.9
scipy==1.11.3
semantic-version==2.10.0
shellingham==1.5.4
simplejson==3.19.2
six==1.16.0
sniffio==1.3.0
soupsieve==2.5
SQLAlchemy==2.0.23
starlette==0.27.0
sympy==1.12
tabulate==0.9.0
tenacity==8.2.3
tiktoken==0.5.1
tokenizers==0.14.1
tomlkit==0.12.0
toolz==0.12.0
tqdm==4.66.1
traits==6.3.2
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.8.0
tzdata==2023.3
unstructured==0.10.29
urllib3==1.26.18
uvicorn==0.24.0.post1
uvloop==0.19.0
watchfiles==0.21.0
websocket-client==1.6.4
websockets==11.0.3
wrapt==1.16.0
yarl==1.9.2
zipp==3.17.0