frascuchon (HF Staff) committed
Commit a854edf · 1 Parent(s): 398adcb

add source code

Files changed (5)
  1. app.py +47 -0
  2. chunker.py +52 -0
  3. rag.py +206 -0
  4. scrape.py +28 -0
  5. web_search.py +36 -0
app.py ADDED
@@ -0,0 +1,47 @@
+ import gradio as gr
+ from string import Template
+
+ from rag import search
+
+
+ async def rag_search(query: str) -> str:
+     """
+     Search for information based on a query.
+
+     Args:
+         query (str): The search query
+
+     Returns:
+         str: A message indicating the search result
+     """
+     results = await search(query, top_k=5)
+
+     source_template = Template(
+         '''
+ ### Source
+ $source
+
+ ### Content
+ $content
+ '''
+     )
+
+     return f"## Results for query {query}\n" + "\n".join(
+         source_template.substitute(
+             source=result['metadata']['source'],
+             content=result['content']
+         ) for result in results
+     ) if results else "No results found."
+
+
+ demo = gr.TabbedInterface(
+     [
+         gr.Interface(rag_search, gr.Textbox(), gr.Textbox(), api_name="search_content", title="RAG Search"),
+     ],
+     [
+         "Search content",
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
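
The `demo.launch(mcp_server=True)` call exposes `rag_search` both through the Gradio UI and as an MCP tool. For a quick check outside the UI, the coroutine can also be awaited directly; a minimal sketch, assuming the dependencies are installed and `SERPER_API_KEY` is set (the query string is just an example):

```python
import asyncio

from app import rag_search


async def main() -> None:
    # rag_search triggers a Serper web search plus on-the-fly indexing, so it
    # needs SERPER_API_KEY to be set and network access to be available.
    markdown = await rag_search("retrieval augmented generation")
    print(markdown)


if __name__ == "__main__":
    asyncio.run(main())
```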
chunker.py ADDED
@@ -0,0 +1,52 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from llama_index.core import Document
+ from llama_index.core.node_parser import SentenceSplitter
+
+
+ def llama_index_sentence_splitter(
+     documents: list[str],
+     document_ids: list[str],
+     chunk_size: int = 256,
+     skip_chunks_threshold: int = 30
+ ) -> list[dict]:
+     """
+     Split documents into chunks using the SentenceSplitter from LlamaIndex.
+
+     Args:
+         documents (list[str]): List of documents to be split into chunks.
+         document_ids (list[str]): List of document IDs corresponding to the documents.
+         chunk_size (int): Size of each chunk. Default is 256.
+         skip_chunks_threshold (int): Minimum length of text in a chunk to be included. Default is 30.
+
+     Returns:
+         list[dict]: A list of dictionaries, each containing a document ID and its corresponding content chunk.
+     """
+
+     def remove_spaces_and_newlines(text: str) -> str:
+         """
+         Collapse consecutive whitespace and newlines into single spaces.
+         """
+         return ' '.join(text.split())
+
+     chunk_overlap = min(chunk_size / 4, min(chunk_size / 2, 64))
+     node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))
+
+     chunks = []
+     docs = [[Document(text=doc)] for doc in documents]
+     for doc_id, doc in zip(document_ids, docs):
+         texts = [node.text for node in node_parser(doc)]
+
+         # Filter out chunks that do not contain enough text
+         texts = [text for text in texts if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold]
+
+         if not texts:
+             continue
+
+         chunks += [
+             {"document_id": doc_id, "content": text}
+             for text in texts
+         ]
+     return chunks
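
A short usage sketch for the splitter (the document text and ID below are placeholders); note that chunks whose normalized text is not longer than `skip_chunks_threshold` characters are dropped:

```python
from chunker import llama_index_sentence_splitter

# Placeholder document and ID; real callers pass scraped page text and URLs.
docs = [
    "ChromaDB is an open-source vector database often used for retrieval "
    "augmented generation. It stores embeddings together with metadata.",
]
chunks = llama_index_sentence_splitter(documents=docs, document_ids=["doc-1"])

for chunk in chunks:
    # Each chunk is a dict like {"document_id": "doc-1", "content": "..."}
    print(chunk["document_id"], len(chunk["content"]))
```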
rag.py ADDED
@@ -0,0 +1,206 @@
+ import chromadb
+ import uuid
+ import os
+
+ from concurrent.futures import ThreadPoolExecutor
+
+ from chunker import llama_index_sentence_splitter
+ from scrape import get_url_content
+ from web_search import google
+
+ client = chromadb.Client()
+
+ COLLECTION_NAME = os.getenv("RAG_INDEX", "default_index").strip() or "default_index"
+
+
+ async def search(query: str, top_k: int = 5) -> list:
+     """
+     Search the ChromaDB collection for documents similar to the query.
+
+     Arguments:
+         query (str): The search query.
+         top_k (int): The number of top results to return.
+
+     Returns:
+         list: A list of dictionaries containing the search results, including documents and metadata.
+     """
+     print("Searching ChromaDB collection for documents similar to the query.")
+
+     if not query:
+         raise ValueError("Query cannot be empty.")
+
+     web_search = await google(q=query, results=2)
+
+     _index_links([result["link"] for result in web_search["organic"]])
+
+     results = _search_k(query, top_k)
+     print(f"Found {len(results['documents'])} documents matching the query.")
+
+     return [
+         {
+             "content": doc,
+             "distance": distance,
+             "metadata": metadata,
+         }
+         for doc, metadata, distance in
+         zip(results["documents"], results["metadatas"], results["distances"])
+     ]
+
+
+ def _index_links(links: list) -> int:
+     """
+     Index a list of URLs by adding their content to the ChromaDB collection.
+
+     Arguments:
+         links (list): A list of URLs to index.
+
+     Returns:
+         int: The total number of chunks added to the collection.
+     """
+     print("Indexing multiple URLs:", links)
+
+     with ThreadPoolExecutor() as executor:
+         running_tasks = [executor.submit(_index_url, link) for link in links]
+         total_chunks = sum(task.result() for task in running_tasks)
+
+     print(f"Total chunks indexed from {len(links)} URLs: {total_chunks}")
+     return total_chunks
+
+
+ def _url_exists(url: str) -> bool:
+     """
+     Check if a URL is already indexed in the ChromaDB collection.
+
+     Arguments:
+         url (str): The URL to check.
+
+     Returns:
+         bool: True if the URL is indexed, False otherwise.
+     """
+     print("Checking if URL exists in the collection:", url)
+     collection = _get_collection()
+
+     # Check if a document with the given source already exists
+     exists = len(collection.get(
+         where={"source": url},
+         limit=1,
+         include=["documents"]
+     ).get("documents", [])) > 0
+
+     print(f"URL {url} exists: {exists}")
+
+     return exists
+
+
+ def _index_url(url: str) -> int:
+     """
+     Index a URL by adding its content to the ChromaDB collection.
+
+     Arguments:
+         url (str): The URL to index.
+
+     Returns:
+         int: The total number of chunks added to the collection.
+     """
+     print("Indexing URL", url)
+     if _url_exists(url):
+         print(f"URL {url} is already indexed. Skipping indexing.")
+         return 0
+
+     document = get_url_content(url)
+
+     if not document:
+         print("No content found at the provided URL.")
+         return 0
+
+     total_chunks = _add_document_to_collection(document, url)
+     print(f"Indexed {total_chunks} chunks from URL: {url}")
+
+     return total_chunks
+
+
+ def _get_collection() -> "chromadb.Collection":
+     """
+     Get the collection from the ChromaDB client.
+
+     :return: The collection object.
+     """
+     collection = client.get_or_create_collection(COLLECTION_NAME)
+
+     print(f"Using collection: {COLLECTION_NAME} with {collection.count()} indexed chunks")
+
+     return collection
+
+
+ def _add_document_to_collection(document: str, source: str) -> int:
+     """
+     Add a document to the ChromaDB collection.
+
+     Args:
+         document (str): The content of the document to be added.
+         source (str): The source URL of the document.
+
+     Returns:
+         int: The number of chunks upserted into the collection.
+     """
+     collection = _get_collection()
+
+     document_chunks = llama_index_sentence_splitter(
+         documents=[document],
+         document_ids=[source],
+     )
+
+     if not document_chunks:
+         print("No document chunks were created. Please check the input document.")
+         return 0
+
+     collection.upsert(
+         ids=[uuid.uuid4().hex for _ in document_chunks],
+         documents=[chunk["content"] for chunk in document_chunks],
+         metadatas=[
+             {"source": source, "chunk_id": i}
+             for i in range(len(document_chunks))
+         ],
+     )
+     return len(document_chunks)
+
+
+ def _search_k(query: str, k: int = 5):
+     """
+     Search the ChromaDB collection for the top k documents matching the query.
+
+     Arguments:
+         query (str): The search query.
+         k (int): The number of top results to return.
+
+     Returns:
+         dict: A dictionary containing the search results, including documents and metadata.
+     """
+     collection = _get_collection()
+
+     results = collection.query(
+         query_texts=[query],
+         n_results=k,
+         include=["documents", "metadatas", "distances"],
+     )
+
+     if not results or not results.get("documents"):
+         print("No results found for the query.")
+         return {
+             "documents": [],
+             "metadatas": [],
+             "distances": []
+         }
+
+     # Chroma returns one list per query text; unwrap the single-query results
+     query_results = {
+         "documents": results["documents"][0],
+         "metadatas": results["metadatas"][0],
+         "distances": results["distances"][0]
+     }
+
+     return query_results
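
For reference, `search` can be exercised on its own; a minimal sketch, assuming `SERPER_API_KEY` is set (the query string is only an example):

```python
import asyncio

from rag import search


async def main() -> None:
    # Each call first runs a Serper web search, indexes the top links into
    # ChromaDB, and then queries the collection.
    results = await search("what is a vector database", top_k=3)
    for result in results:
        print(result["metadata"]["source"], round(result["distance"], 3))


if __name__ == "__main__":
    asyncio.run(main())
```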
scrape.py ADDED
@@ -0,0 +1,28 @@
+ import requests
+ from bs4 import BeautifulSoup
+
+
+ def get_url_content(url: str) -> str:
+     """
+     Retrieve the content of a URL.
+
+     :param url: The URL to retrieve content from.
+     :return: The content of the URL as a string.
+     """
+
+     response = requests.get(
+         url,
+         headers={
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+         }
+     )
+
+     if response.status_code != 200:
+         print(f"Failed to retrieve content from {url}. Status code: {response.status_code} - {response.reason}")
+         return ""
+
+     # Parse the HTML content using BeautifulSoup
+     parser = BeautifulSoup(response.text, 'html.parser')
+
+     # Extract the visible text from the parsed HTML
+     return parser.text.strip() if parser.text else ""
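
A minimal call to the scraper (the URL is only an example); the function returns an empty string when the request fails:

```python
from scrape import get_url_content

text = get_url_content("https://example.com")
print(text[:200] if text else "No content retrieved.")
```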
web_search.py ADDED
@@ -0,0 +1,36 @@
+ import aiohttp
+ import certifi
+ import os
+ import ssl
+ from typing import Dict, Any
+
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY", "").strip()
+ AIOHTTP_TIMEOUT = int(os.getenv("AIOHTTP_TIMEOUT", "15"))
+
+ if not SERPER_API_KEY:
+     raise ValueError("SERPER_API_KEY environment variable is not set.")
+
+
+ async def google(q: str, results: int = 5) -> Dict[str, Any]:
+     """Run a Google search through the Serper API and return the parsed JSON response."""
+     url = "https://google.serper.dev/search"
+     return await fetch_json(url, {
+         "q": q,
+         "num": results,
+         "page": 1,
+     })
+
+
+ async def fetch_json(url: str, payload: dict) -> Dict[str, Any]:
+     """POST a JSON payload to the given URL with the Serper API key and return the JSON response."""
+     headers = {
+         'X-API-KEY': SERPER_API_KEY,
+         'Content-Type': 'application/json'
+     }
+
+     ssl_context = ssl.create_default_context(cafile=certifi.where())
+     connector = aiohttp.TCPConnector(ssl=ssl_context)
+
+     timeout = aiohttp.ClientTimeout(total=AIOHTTP_TIMEOUT)
+     async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+         async with session.post(url, headers=headers, json=payload) as response:
+             response.raise_for_status()
+             return await response.json()
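
A sketch of calling the Serper wrapper directly, assuming `SERPER_API_KEY` is exported; rag.py only relies on the `link` field of the `organic` entries, and the `title` field shown here is assumed from the usual Serper response shape:

```python
import asyncio

from web_search import google


async def main() -> None:
    response = await google("gradio mcp server", results=3)
    for item in response.get("organic", []):
        print(item.get("title", ""), "->", item["link"])


if __name__ == "__main__":
    asyncio.run(main())
```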