frascuchon (HF Staff) committed
Commit a854edf · 1 Parent(s): 398adcb

add source code

Files changed (5)
  1. app.py +47 -0
  2. chunker.py +52 -0
  3. rag.py +206 -0
  4. scrape.py +28 -0
  5. web_search.py +36 -0
app.py ADDED
@@ -0,0 +1,47 @@
+ import gradio as gr
+ from string import Template
+
+ from rag import search
+
+
+ async def rag_search(query: str) -> str:
+     """
+     Search for information based on a query.
+
+     Args:
+         query (str): The search query
+
+     Returns:
+         str: A message indicating the search result
+     """
+     results = await search(query, top_k=5)
+
+     source_template = Template(
+         '''
+ ### Source
+ $source
+
+ ### Content
+ $content
+ '''
+     )
+
+     return f"## Results for query {query}\n" + "\n".join(
+         source_template.substitute(
+             source=result['metadata']['source'],
+             content=result['content']
+         ) for result in results
+     ) if results else "No results found."
+
+
+ demo = gr.TabbedInterface(
+     [
+         gr.Interface(rag_search, gr.Textbox(), gr.Textbox(), api_name="search_content", title="RAG Search"),
+     ],
+     [
+         "Search content",
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
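
The `demo.launch(mcp_server=True)` call exposes `rag_search` both through the Gradio UI and as an MCP tool. For a quick check outside the UI, the coroutine can also be awaited directly; a minimal sketch, assuming the dependencies are installed and `SERPER_API_KEY` is set (the query string is just an example):

```python
import asyncio

from app import rag_search


async def main() -> None:
    # rag_search triggers a Serper web search plus on-the-fly indexing, so it
    # needs SERPER_API_KEY to be set and network access to be available.
    markdown = await rag_search("retrieval augmented generation")
    print(markdown)


if __name__ == "__main__":
    asyncio.run(main())
```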
chunker.py ADDED
@@ -0,0 +1,52 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from llama_index.core import Document
+ from llama_index.core.node_parser import SentenceSplitter
+
+
+ def llama_index_sentence_splitter(
+     documents: list[str],
+     document_ids: list[str],
+     chunk_size: int = 256,
+     skip_chunks_threshold: int = 30
+ ) -> list[dict]:
+     """
+     Split documents into chunks using the SentenceSplitter from LlamaIndex.
+
+     Args:
+         documents (list[str]): List of documents to be split into chunks.
+         document_ids (list[str]): List of document IDs corresponding to the documents.
+         chunk_size (int): Size of each chunk. Default is 256.
+         skip_chunks_threshold (int): Minimum length of text in a chunk to be included. Default is 30.
+
+     Returns:
+         list[dict]: A list of dictionaries, each containing a document ID and its corresponding content chunk.
+     """
+
+     def remove_spaces_and_newlines(text: str) -> str:
+         """
+         Collapse consecutive whitespace and newlines into single spaces.
+         """
+         return ' '.join(text.split())
+
+     chunk_overlap = min(chunk_size / 4, min(chunk_size / 2, 64))
+     node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))
+
+     chunks = []
+     docs = [[Document(text=doc)] for doc in documents]
+     for doc_id, doc in zip(document_ids, docs):
+         texts = [node.text for node in node_parser(doc)]
+
+         # Filter out chunks that do not contain enough text
+         texts = [text for text in texts if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold]
+
+         if not texts:
+             continue
+
+         chunks += [
+             {"document_id": doc_id, "content": text}
+             for text in texts
+         ]
+     return chunks
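
A short usage sketch for the splitter (the document text and ID below are placeholders); note that chunks whose normalized text is not longer than `skip_chunks_threshold` characters are dropped:

```python
from chunker import llama_index_sentence_splitter

# Placeholder document and ID; real callers pass scraped page text and URLs.
docs = [
    "ChromaDB is an open-source vector database often used for retrieval "
    "augmented generation. It stores embeddings together with metadata.",
]
chunks = llama_index_sentence_splitter(documents=docs, document_ids=["doc-1"])

for chunk in chunks:
    # Each chunk is a dict like {"document_id": "doc-1", "content": "..."}
    print(chunk["document_id"], len(chunk["content"]))
```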
rag.py ADDED
@@ -0,0 +1,206 @@
+ import chromadb
+ import uuid
+ import os
+
+ from concurrent.futures import ThreadPoolExecutor
+
+ from chunker import llama_index_sentence_splitter
+ from scrape import get_url_content
+ from web_search import google
+
+ client = chromadb.Client()
+
+ COLLECTION_NAME = os.getenv("RAG_INDEX", "default_index").strip() or "default_index"
+
+
+ async def search(query: str, top_k: int = 5) -> list:
+     """
+     Search the ChromaDB collection for documents similar to the query.
+
+     Arguments:
+         query (str): The search query.
+         top_k (int): The number of top results to return.
+
+     Returns:
+         list: A list of dictionaries containing the search results, including documents and metadata.
+     """
+     print("Searching ChromaDB collection for documents similar to the query.")
+
+     if not query:
+         raise ValueError("Query cannot be empty.")
+
+     web_search = await google(q=query, results=2)
+
+     _index_links([result["link"] for result in web_search["organic"]])
+
+     results = _search_k(query, top_k)
+     print(f"Found {len(results['documents'])} documents matching the query.")
+
+     return [
+         {
+             "content": doc,
+             "distance": distance,
+             "metadata": metadata,
+         }
+         for doc, metadata, distance in
+         zip(results["documents"], results["metadatas"], results["distances"])
+     ]
+
+
+ def _index_links(links: list) -> int:
+     """
+     Index a list of URLs by adding their content to the ChromaDB collection.
+
+     Arguments:
+         links (list): A list of URLs to index.
+
+     Returns:
+         int: The total number of chunks added to the collection.
+     """
+     print("Indexing multiple URLs:", links)
+
+     with ThreadPoolExecutor() as executor:
+         running_tasks = [executor.submit(_index_url, link) for link in links]
+         total_chunks = sum(task.result() for task in running_tasks)
+
+     print(f"Total chunks indexed from {len(links)} URLs: {total_chunks}")
+     return total_chunks
+
+
+ def _url_exists(url: str) -> bool:
+     """
+     Check if a URL is already indexed in the ChromaDB collection.
+
+     Arguments:
+         url (str): The URL to check.
+
+     Returns:
+         bool: True if the URL is indexed, False otherwise.
+     """
+     print("Checking if URL exists in the collection:", url)
+     collection = _get_collection()
+
+     # Check if a document with the given source already exists
+     exists = len(collection.get(
+         where={"source": url},
+         limit=1,
+         include=["documents"]
+     ).get("documents", [])) > 0
+
+     print(f"URL {url} exists: {exists}")
+
+     return exists
+
+
+ def _index_url(url: str) -> int:
+     """
+     Index a URL by adding its content to the ChromaDB collection.
+
+     Arguments:
+         url (str): The URL to index.
+
+     Returns:
+         int: The total number of chunks added to the collection.
+     """
+     print("Indexing URL", url)
+     if _url_exists(url):
+         print(f"URL {url} is already indexed. Skipping indexing.")
+         return 0
+
+     document = get_url_content(url)
+
+     if not document:
+         print("No content found at the provided URL.")
+         return 0
+
+     total_chunks = _add_document_to_collection(document, url)
+     print(f"Indexed {total_chunks} chunks from URL: {url}")
+
+     return total_chunks
+
+
+ def _get_collection() -> "chromadb.Collection":
+     """
+     Get the collection from the ChromaDB client.
+
+     :return: The collection object.
+     """
+     collection = client.get_or_create_collection(COLLECTION_NAME)
+
+     print(f"Using collection: {COLLECTION_NAME} with {collection.count()} indexed chunks")
+
+     return collection
+
+
+ def _add_document_to_collection(document: str, source: str) -> int:
+     """
+     Add a document to the ChromaDB collection.
+
+     Args:
+         document (str): The content of the document to be added.
+         source (str): The source URL of the document.
+
+     Returns:
+         int: The number of chunks upserted into the collection.
+     """
+     collection = _get_collection()
+
+     document_chunks = llama_index_sentence_splitter(
+         documents=[document],
+         document_ids=[source],
+     )
+
+     if not document_chunks:
+         print("No document chunks were created. Please check the input document.")
+         return 0
+
+     collection.upsert(
+         ids=[uuid.uuid4().hex for _ in document_chunks],
+         documents=[chunk["content"] for chunk in document_chunks],
+         metadatas=[
+             {"source": source, "chunk_id": i}
+             for i in range(len(document_chunks))
+         ],
+     )
+     return len(document_chunks)
+
+
+ def _search_k(query: str, k: int = 5):
+     """
+     Search the ChromaDB collection for the top k documents matching the query.
+
+     Arguments:
+         query (str): The search query.
+         k (int): The number of top results to return.
+
+     Returns:
+         dict: A dictionary containing the search results, including documents and metadata.
+     """
+     collection = _get_collection()
+
+     results = collection.query(
+         query_texts=[query],
+         n_results=k,
+         include=["documents", "metadatas", "distances"],
+     )
+
+     if not results or not results.get("documents"):
+         print("No results found for the query.")
+         return {
+             "documents": [],
+             "metadatas": [],
+             "distances": []
+         }
+
+     # Chroma returns one list per query text; unwrap the single-query results
+     query_results = {
+         "documents": results["documents"][0],
+         "metadatas": results["metadatas"][0],
+         "distances": results["distances"][0]
+     }
+
+     return query_results
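
For reference, `search` can be exercised on its own; a minimal sketch, assuming `SERPER_API_KEY` is set (the query string is only an example):

```python
import asyncio

from rag import search


async def main() -> None:
    # Each call first runs a Serper web search, indexes the top links into
    # ChromaDB, and then queries the collection.
    results = await search("what is a vector database", top_k=3)
    for result in results:
        print(result["metadata"]["source"], round(result["distance"], 3))


if __name__ == "__main__":
    asyncio.run(main())
```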
scrape.py ADDED
@@ -0,0 +1,28 @@
+ import requests
+ from bs4 import BeautifulSoup
+
+
+ def get_url_content(url: str) -> str:
+     """
+     Retrieve the content of a URL.
+
+     :param url: The URL to retrieve content from.
+     :return: The content of the URL as a string.
+     """
+
+     response = requests.get(
+         url,
+         headers={
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+         }
+     )
+
+     if response.status_code != 200:
+         print(f"Failed to retrieve content from {url}. Status code: {response.status_code} - {response.reason}")
+         return ""
+
+     # Parse the HTML content using BeautifulSoup
+     parser = BeautifulSoup(response.text, 'html.parser')
+
+     # Extract the visible text from the parsed HTML
+     return parser.text.strip() if parser.text else ""
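
A minimal call to the scraper (the URL is only an example); the function returns an empty string when the request fails:

```python
from scrape import get_url_content

text = get_url_content("https://example.com")
print(text[:200] if text else "No content retrieved.")
```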
web_search.py ADDED
@@ -0,0 +1,36 @@
+ import aiohttp
+ import certifi
+ import os
+ import ssl
+ from typing import Dict, Any
+
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY", "").strip()
+ AIOHTTP_TIMEOUT = int(os.getenv("AIOHTTP_TIMEOUT", "15"))
+
+ if not SERPER_API_KEY:
+     raise ValueError("SERPER_API_KEY environment variable is not set.")
+
+
+ async def google(q: str, results: int = 5) -> Dict[str, Any]:
+     """Run a Google search through the Serper API and return the parsed JSON response."""
+     url = "https://google.serper.dev/search"
+     return await fetch_json(url, {
+         "q": q,
+         "num": results,
+         "page": 1,
+     })
+
+
+ async def fetch_json(url: str, payload: dict) -> Dict[str, Any]:
+     """POST a JSON payload to the given URL with the Serper API key and return the JSON response."""
+     headers = {
+         'X-API-KEY': SERPER_API_KEY,
+         'Content-Type': 'application/json'
+     }
+
+     ssl_context = ssl.create_default_context(cafile=certifi.where())
+     connector = aiohttp.TCPConnector(ssl=ssl_context)
+
+     timeout = aiohttp.ClientTimeout(total=AIOHTTP_TIMEOUT)
+     async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+         async with session.post(url, headers=headers, json=payload) as response:
+             response.raise_for_status()
+             return await response.json()
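
A sketch of calling the Serper wrapper directly, assuming `SERPER_API_KEY` is exported; rag.py only relies on the `link` field of the `organic` entries, and the `title` field shown here is assumed from the usual Serper response shape:

```python
import asyncio

from web_search import google


async def main() -> None:
    response = await google("gradio mcp server", results=3)
    for item in response.get("organic", []):
        print(item.get("title", ""), "->", item["link"])


if __name__ == "__main__":
    asyncio.run(main())
```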