Spaces: Running
Commit a854edf
Parent(s): 398adcb

add source code

- app.py +47 -0
- chunker.py +52 -0
- rag.py +206 -0
- scrape.py +28 -0
- web_search.py +36 -0
app.py
ADDED
@@ -0,0 +1,47 @@
import gradio as gr
from string import Template

from rag import search


async def rag_search(query: str) -> str:
    """
    Search for information based on a query.

    Args:
        query (str): The search query

    Returns:
        str: A message indicating the search result
    """
    results = await search(query, top_k=5)

    source_template = Template(
        '''
### Source
$source

### Content
$content
'''
    )

    return f"## Results for query {query}\n" + "\n".join(
        source_template.substitute(
            source=result['metadata']['source'],
            content=result['content']
        ) for result in results
    ) if results else "No results found."


demo = gr.TabbedInterface(
    [
        gr.Interface(rag_search, gr.Textbox(), gr.Textbox(), api_name="search_content", title="RAG Search"),
    ],
    [
        "Search content",
    ]
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)
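
Since demo.launch(mcp_server=True) also serves the interface as a regular Gradio app, the tool can be exercised over the HTTP API as well. A minimal sketch with gradio_client, assuming the app is running locally on the default port and the endpoint resolves to "/search_content" (the port and endpoint name are assumptions, not taken from this commit):

# Hypothetical client-side call to the search tool (port and endpoint name assumed).
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")
markdown = client.predict("what is retrieval-augmented generation?", api_name="/search_content")
print(markdown)
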
chunker.py
ADDED
@@ -0,0 +1,52 @@
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter


def llama_index_sentence_splitter(
    documents: list[str],
    document_ids: list[str],
    chunk_size: int = 256,
    skip_chunks_threshold: int = 30
) -> list[dict]:
    """
    Split documents into chunks using the SentenceSplitter from LlamaIndex.

    Args:
        documents (list[str]): List of documents to be split into chunks.
        document_ids (list[str]): List of document IDs corresponding to the documents.
        chunk_size (int): Size of each chunk. Default is 256.
        skip_chunks_threshold (int): Minimum length of text in a chunk to be included. Default is 30.

    Returns:
        list[dict]: A list of dictionaries, each containing a document ID and its corresponding content chunk.
    """

    def remove_spaces_and_newlines(text: str) -> str:
        """Collapse runs of whitespace and newlines into single spaces."""
        return ' '.join(text.split())

    # Overlap is a quarter of the chunk size, capped at 64.
    chunk_overlap = min(chunk_size // 4, 64)
    node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    chunks = []
    docs = [[Document(text=doc)] for doc in documents]
    for doc_id, doc in zip(document_ids, docs):
        texts = [node.text for node in node_parser(doc)]

        # Filter out chunks that do not contain enough text
        texts = [text for text in texts if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold]

        if not texts:
            continue

        chunks += [
            {"document_id": doc_id, "content": text}
            for text in texts
        ]
    return chunks
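
For reference, a quick sketch of exercising the chunker on its own; the sample document and ID below are made up for illustration:

# Hypothetical stand-alone use of the chunker (sample text and ID are illustrative).
from chunker import llama_index_sentence_splitter

chunks = llama_index_sentence_splitter(
    documents=["A long article about vector databases and retrieval. " * 50],
    document_ids=["https://example.com/vector-dbs"],
    chunk_size=128,
)
print(len(chunks), "chunks")
print(chunks[0]["document_id"], chunks[0]["content"][:80])
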
rag.py
ADDED
@@ -0,0 +1,206 @@
import chromadb
import os
import uuid

from chunker import llama_index_sentence_splitter
from scrape import get_url_content
from web_search import google

client = chromadb.Client()

COLLECTION_NAME = os.getenv("RAG_INDEX", "default_index").strip() or "default_index"


async def search(query: str, top_k: int = 5) -> list:
    """
    Search the ChromaDB collection for documents similar to the query.

    Arguments:
        query (str): The search query.
        top_k (int): The number of top results to return.

    Returns:
        list: A list of dictionaries containing the search results, including documents and metadata.
    """
    print("Searching ChromaDB collection for documents similar to the query.")

    if not query:
        raise ValueError("Query cannot be empty.")

    web_search = await google(q=query, results=2)

    _index_links([result["link"] for result in web_search["organic"]])

    results = _search_k(query, top_k)
    print(f"Found {len(results['documents'])} documents matching the query.")

    return [
        {
            "content": doc,
            "distance": distance,
            "metadata": metadata,
        }
        for doc, metadata, distance in zip(results["documents"], results["metadatas"], results["distances"])
    ]


def _index_links(links: list) -> int:
    """
    Index a list of URLs by adding their content to the ChromaDB collection.

    Arguments:
        links (list): A list of URLs to index.

    Returns:
        int: The total number of chunks added to the collection.
    """
    from concurrent.futures import ThreadPoolExecutor

    print("Indexing multiple URLs:", links)

    with ThreadPoolExecutor() as executor:
        running_tasks = [executor.submit(_index_url, link) for link in links]
        total_chunks = sum(task.result() for task in running_tasks)

    print(f"Total chunks indexed from {len(links)} URLs: {total_chunks}")
    return total_chunks


def _url_exists(url: str) -> bool:
    """
    Check if a URL is already indexed in the ChromaDB collection.

    Arguments:
        url (str): The URL to check.

    Returns:
        bool: True if the URL is indexed, False otherwise.
    """
    print("Checking if URL exists in the collection:", url)
    collection = _get_collection()

    # Check if a document with the given source exists
    exists = len(collection.get(
        where={"source": url},
        limit=1,
        include=["documents"]
    ).get("documents", [])) > 0

    print(f"URL {url} exists: {exists}")

    return exists


def _index_url(url: str) -> int:
    """
    Index a URL by adding its content to the ChromaDB collection.

    Arguments:
        url (str): The URL to index.

    Returns:
        int: The total number of chunks added to the collection.
    """
    print("Indexing URL", url)
    if _url_exists(url):
        print(f"URL {url} is already indexed. Skipping indexing.")
        return 0

    document = get_url_content(url)

    if not document:
        print("No content found at the provided URL.")
        return 0

    total_chunks = _add_document_to_collection(document, url)
    print(f"Indexed {total_chunks} chunks from URL: {url}")

    return total_chunks


def _get_collection() -> "chromadb.Collection":
    """
    Get the collection from the ChromaDB client.

    :return: The collection object.
    """
    collection = client.get_or_create_collection(COLLECTION_NAME)

    print(f"Using collection: {COLLECTION_NAME} with {collection.count()} indexed chunks")

    return collection


def _add_document_to_collection(document: str, source: str) -> int:
    """
    Adds a document to the ChromaDB collection.

    Args:
        document (str): The content of the document to be added.
        source (str): The source URI of the document.

    Returns:
        int: The number of chunks upserted into the collection.
    """
    collection = _get_collection()

    document_chunks = llama_index_sentence_splitter(
        documents=[document],
        document_ids=[source],
    )

    if not document_chunks:
        print("No document chunks were created. Please check the input document.")
        return 0

    collection.upsert(
        ids=[uuid.uuid4().hex for _ in document_chunks],
        documents=[chunk["content"] for chunk in document_chunks],
        metadatas=[
            {"source": source, "chunk_id": i}
            for i in range(len(document_chunks))
        ],
    )
    return len(document_chunks)


def _search_k(query: str, k: int = 5) -> dict:
    """
    Search the ChromaDB collection for the top k documents matching the query.

    Arguments:
        query (str): The search query.
        k (int): The number of top results to return.

    Returns:
        dict: A dictionary containing the search results, including documents and metadata.
    """
    collection = _get_collection()

    results = collection.query(
        query_texts=[query],
        n_results=k,
        include=["documents", "metadatas", "distances"],
    )

    if not results or not results.get("documents"):
        print("No results found for the query.")
        return {
            "documents": [],
            "metadatas": [],
            "distances": []
        }

    query_results = {
        "documents": results["documents"][0],
        "metadatas": results["metadatas"][0],
        "distances": results["distances"][0]
    }

    return query_results
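
Because search is a coroutine that also triggers web search and indexing, it can be smoke-tested outside Gradio with asyncio. This is a hypothetical sketch, assuming SERPER_API_KEY is set and network access is available; the query string is only an example:

# Hypothetical smoke test for the RAG pipeline (requires SERPER_API_KEY and network access).
import asyncio

from rag import search

async def main():
    results = await search("latest developments in retrieval-augmented generation", top_k=3)
    for result in results:
        print(result["metadata"]["source"], "->", result["content"][:80])

asyncio.run(main())
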
scrape.py
ADDED
@@ -0,0 +1,28 @@
import requests
from bs4 import BeautifulSoup


def get_url_content(url: str) -> str:
    """
    Retrieve the content of a URL.

    :param url: The URL to retrieve content from.
    :return: The content of the URL as a string.
    """

    response = requests.get(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        },
        timeout=30  # avoid hanging indefinitely on unresponsive hosts
    )

    if response.status_code != 200:
        print(f"Failed to retrieve content from {url}. Status code: {response.status_code} - {response.reason}")
        return ""

    # Parse the HTML content using BeautifulSoup
    parser = BeautifulSoup(response.text, 'html.parser')

    # Extract the visible text from the parsed HTML
    return parser.text.strip() if parser.text else ""
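
A small sketch of the scraper used in isolation; the URL below is just an example:

# Hypothetical direct use of the scraper (example URL).
from scrape import get_url_content

text = get_url_content("https://example.com")
print(text[:200] if text else "No content retrieved.")
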
web_search.py
ADDED
@@ -0,0 +1,36 @@
import aiohttp
import certifi
import os
import ssl
from typing import Dict, Any

SERPER_API_KEY = os.getenv("SERPER_API_KEY", "").strip()
AIOHTTP_TIMEOUT = int(os.getenv("AIOHTTP_TIMEOUT", "15"))

if not SERPER_API_KEY:
    raise ValueError("SERPER_API_KEY environment variable is not set.")


async def google(q: str, results: int = 5) -> Dict[str, Any]:
    """Run a Google search through the Serper API and return the JSON response."""
    url = "https://google.serper.dev/search"
    return await fetch_json(url, {
        "q": q,
        "num": results,
        "page": 1,
    })


async def fetch_json(url: str, payload: dict) -> Dict[str, Any]:
    """POST a JSON payload to the given URL and return the decoded JSON response."""
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    ssl_context = ssl.create_default_context(cafile=certifi.where())
    connector = aiohttp.TCPConnector(ssl=ssl_context)

    timeout = aiohttp.ClientTimeout(total=AIOHTTP_TIMEOUT)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async with session.post(url, headers=headers, json=payload) as response:
            response.raise_for_status()
            return await response.json()
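
Finally, a sketch of calling the Serper wrapper directly, assuming SERPER_API_KEY is exported. The "organic" and "link" fields mirror what rag.py already reads from the response; "title" is an assumption about the Serper payload, hence the .get fallbacks:

# Hypothetical direct call to the Serper wrapper (requires SERPER_API_KEY).
import asyncio

from web_search import google

async def main():
    response = await google("chromadb sentence splitter rag", results=3)
    for organic in response.get("organic", []):
        print(organic.get("title", ""), organic.get("link", ""))

asyncio.run(main())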