Spaces:

MarkChenX
/

mindify-chat-api-demo

Sleeping

App Files Files Community

MarkChenX committed on Jul 22, 2024

Commit

1585a60

verified ·

1 Parent(s): 1a736cb

Upload 7 files

Browse files

Files changed (5) hide show

cura/chatbot.py +16 -0
cura/github_ingestion.py +33 -23
cura/vector_store.py +24 -24
index.py +30 -55
requirements.txt +3 -3

cura/chatbot.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from openai import OpenAI
def ask_question(query: str):
    """
    Ask the OpenAI chat completions API a single question.

    Args:
    query: str
        The user's question; sent as the only user message, after a fixed
        system message.

    Returns:
    str
        The message content of the first returned choice, or a string
        starting with "Failed to ask question:" if the request or the
        response handling raises.
    """
    # The client is created outside the try block, so constructor errors
    # propagate to the caller instead of being turned into a string.
    client = OpenAI()
    conversation = [
        {"role": "system", "content": "You are a code assistant."},
        {"role": "user", "content": query},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=conversation,
            max_tokens=1024,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        return f"Failed to ask question: {e}"

cura/github_ingestion.py CHANGED Viewed

@@ -1,11 +1,18 @@
-"""
-GitHub Repo File Ingestion and Indexing
-"""
 from langchain_community.document_loaders.github import GithubFileLoader
 from tqdm import tqdm
 def ingest_github_repo(repo_name: str, access_token: str):
     """
     Ingests files from a GitHub repository and returns the files as a list of strings.
@@ -20,27 +27,30 @@ def ingest_github_repo(repo_name: str, access_token: str):
     list
         A list of strings containing the contents of the files in the repository.
     """
-    loader = GithubFileLoader(
-        repo=repo_name,
-        access_token=access_token,
-    )
     # List the directory contents for the repository
     file_paths = loader.get_file_paths()
-    # Load the files from the repository using curl
     files = []
     print("Ingesting files from the repository...")
-    for i in tqdm(range(len(file_paths))):
-        try:
-            file = loader.get_file_content_by_path(file_paths[i]["path"])
-            # If the file is not textual file, skip it
-            if file is None:
-                continue
-            else:
                 files.append(file)
-        except:
-            continue
-    return files, file_paths

+from concurrent.futures import ThreadPoolExecutor, as_completed
 from langchain_community.document_loaders.github import GithubFileLoader
 from tqdm import tqdm
def fetch_file_content(loader, path):
    """
    Fetch the content of one repository file through the given loader.

    Args:
    loader:
        Object exposing ``get_file_content_by_path(path)``.
    path:
        Path of the file to fetch; always echoed back in the result.

    Returns:
    tuple
        ``(content, path)``. ``content`` is None when the loader returned
        None or raised; in the latter case the error is printed.
    """
    try:
        content = loader.get_file_content_by_path(path)
    except Exception as e:
        print(f"Error fetching file {path}: {e}")
        return None, path
    # A None content and a real content are reported in the same shape.
    return content, path
 def ingest_github_repo(repo_name: str, access_token: str):
     """
     Ingests files from a GitHub repository and returns the files as a list of strings.
     list
         A list of strings containing the contents of the files in the repository.
     """
+    if access_token is not "":
+        loader = GithubFileLoader(
+            repo=repo_name,
+            access_token=access_token,
+        )
+    else:
+        print("No access token provided. Using public access.")
     # List the directory contents for the repository
     file_paths = loader.get_file_paths()
     files = []
     print("Ingesting files from the repository...")
+    with ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(fetch_file_content, loader, file_path["path"]): file_path
+            for file_path in file_paths
+        }
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            file = future.result()
+            if file is not None:
                 files.append(file)
+    return files

cura/vector_store.py CHANGED Viewed

@@ -1,25 +1,26 @@
 """
 Vector Store for Mindify Chat
 """
 import chromadb
 def set_up_chromadb(collection_name: str):
     """
     Set up a ChromaDB collection for storing vectors.
     Args:
     collection_name: str
         The name of the collection to create or retrieve.
     Returns:
     ChromaDB Collection
         The ChromaDB collection object.
     """
     chroma_client = chromadb.Client()
-    try:
         # Check if the collection already exists
         collection = chroma_client.get_collection(name=collection_name)
         return collection
@@ -29,51 +30,50 @@ def set_up_chromadb(collection_name: str):
         return collection
-def index_vector_store(collection_name:str, files: list):
     """
     Index the files in the ChromaDB collection.
     Args:
     collection: ChromaDB Collection
         The collection to store the vectors in.
     files: list
         A list of strings containing the contents of the files.
     Returns:
     bool
         True if the data is stored successfully, False otherwise.
     """
-    # Set up collection
-    try:
-        collection = chromadb.Client().get_collection(name=collection_name)
-    except:
-        collection = chromadb.Client().create_collection(name=collection_name)
     print("Indexing files...")
     ids = []
     for i in range(len(files[0])):
         ids.append(str(i))
     print("Storing GitHub data in ChromaDB...")
     try:
         collection.add(ids=ids, documents=files[0])
         print("Data stored successfully!")
-        return True
     except:
         print("Error storing data in ChromaDB")
         return False
-def query_vector_store(collection_name: str, query: str):
     """
     Query the ChromaDB collection for similar vectors to the query vector.
     """
     print("Querying ChromaDB...")
     try:
-        list_collection = chromadb.Client().list_collections()
-        print(list_collection)
-        collection = chromadb.Client().get_collection(name=collection_name)
-        return collection.query(query_texts=query, n_results=5)
     except:
         print("Error querying ChromaDB")
-        return None

 """
 Vector Store for Mindify Chat
 """
 import chromadb
+from chromadb import Collection
 def set_up_chromadb(collection_name: str):
     """
     Set up a ChromaDB collection for storing vectors.
     Args:
     collection_name: str
         The name of the collection to create or retrieve.
     Returns:
     ChromaDB Collection
         The ChromaDB collection object.
     """
     chroma_client = chromadb.Client()
+    try:
         # Check if the collection already exists
         collection = chroma_client.get_collection(name=collection_name)
         return collection
         return collection
def index_vector_store_chroma(collection: Collection, files: list):
    """
    Index the files in the ChromaDB collection.

    Args:
    collection: ChromaDB Collection
        The collection to store the vectors in.
    files: list
        A list of strings containing the contents of the files. For
        backward compatibility, if the first element is itself a list or
        tuple, that first element is what gets indexed (the previous
        behaviour of always indexing ``files[0]``).

    Returns:
    tuple
        ``(True, collection)`` if the data is stored successfully,
        ``(False, collection)`` otherwise. Both outcomes have the same
        shape so callers can always unpack the result.
    """
    print("Indexing files...")
    # Pick the documents to store: either the nested sequence (legacy
    # input shape) or the flat list of file contents.
    if files and isinstance(files[0], (list, tuple)):
        documents = list(files[0])
    else:
        documents = list(files)

    # Document ids are simply the positions, as strings.
    ids = [str(position) for position in range(len(documents))]

    print("Storing GitHub data in ChromaDB...")
    try:
        collection.add(ids=ids, documents=documents)
    except Exception as e:
        # Report the cause instead of swallowing it silently.
        print(f"Error storing data in ChromaDB: {e}")
        return False, collection
    print("Data stored successfully!")
    return True, collection
def query_vector_store_chroma(collection: Collection, query: str):
    """
    Query the ChromaDB collection for the document most similar to the query.

    Args:
    collection: ChromaDB Collection
        The collection to search.
    query: str
        The query text.

    Returns:
    str or None
        The first document of the first result set, or None if the query
        fails or yields no document.
    """
    print("Querying ChromaDB...")
    try:
        results = collection.query(
            query_texts=[query],
            n_results=2,
        )
        # Two results are requested, but only the top one is returned.
        top_document = results["documents"][0][0]
    except Exception as e:
        # Covers both a failed query and an empty or malformed result.
        print(f"Error querying ChromaDB: {e}")
        return None
    print("Query successful!")
    return top_document

index.py CHANGED Viewed

@@ -1,14 +1,12 @@
-from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-from .database import post_github_access_token, post_github_repo, get_github_access_token
-from .cura import github_ingestion, vector_store
-from pydantic import BaseModel
 app = FastAPI(
-    title="Mindify Chat API",
-    description="API for Mindify Chat",
-    version="0.1"
 )
 app.add_middleware(
@@ -16,62 +14,39 @@ app.add_middleware(
     allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
-    allow_headers=["*"]
 )
-# Define request body models
-class AccessTokenRequest(BaseModel):
-    token: str
-    user_email: str
-class RepoRequest(BaseModel):
-    repo_name: str
-    user_email: str
 @app.get("/")
 def read_root():
     return {"Hello": "World"}
-@app.post("/github/access_token")
-def post_github_access_token_route(request: AccessTokenRequest):
-    token = request.token
-    user_email = request.user_email
-    post_github_access_token(token, user_email)
-    return {"status": "success"}
-@app.post("/github/repo")
-def post_github_repo_route(request: RepoRequest):
-    repo_name = request.repo_name
-    user_email = request.user_email
-    post_github_repo(repo_name, user_email)
-    return {"status": "success"}
-@app.post("/github/index")
-def index_github_repo_route(request: RepoRequest):
-    repo_name = request.repo_name
-    user_email = request.user_email
-    access_token = get_github_access_token(user_email)
-    collection_name = repo_name.replace("/", "_")
-    if access_token is not None:
-        files = github_ingestion.ingest_github_repo(repo_name, access_token)
-        results = vector_store.index_vector_store(files=files, collection_name = collection_name)
-        if results:
-            return {"status": "success", "message": "GitHub data stored in ChromaDB"}
-        else:
-            return {"status": "error", "message": "Failed to set up ChromaDB collection"}
-    else:
-        return {"status": "error", "message": "Failed to get GitHub access token"}
-@app.post("/github/query")
-def query_github_repo_route(repo_name: str, query: str):
     collection_name = repo_name.replace("/", "_")
-    if collection_name is not None:
-        response = vector_store.query_vector_store(collection_name=collection_name, query=query)
-        return {"status": "success", "response": response}
     else:
-        return {"status": "error", "message": "Failed to set up ChromaDB collection"}
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app)

+from fastapi import FastAPI, Form
 from fastapi.middleware.cors import CORSMiddleware
+from database import post_github_access_token, post_github_repo, get_github_access_token
+from cura import github_ingestion, vector_store, chatbot
+import gradio as gr
 app = FastAPI(
+    title="Mindify Chat API", description="API for Mindify Chat", version="0.1"
 )
 app.add_middleware(
     allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
+    allow_headers=["*"],
 )
 @app.get("/")
 def read_root():
     return {"Hello": "World"}
@app.post("/chat/query")
def query_chat_route(query: str = Form(...), repo_name: str = Form(...), token: str = Form(...)):
    """
    Answer a chat query, using a GitHub repository as context when one is named.

    With a non-empty ``repo_name`` the repository is ingested, indexed into a
    ChromaDB collection named after it, and the best-matching document for
    ``query`` is returned. With an empty ``repo_name`` the query is sent to
    the chatbot directly.

    Args:
    query: str
        The user's question (form field).
    repo_name: str
        Repository name (form field); may be empty.
    token: str
        GitHub access token (form field), forwarded to the ingestion step.

    Returns:
    dict
        ``{"status": "success", "response": ...}`` on success, or
        ``{"status": "error", "message": ...}`` if indexing or querying fails.
    """
    if not repo_name:
        print("No repo name provided. Using default collection.")
        return {"status": "success", "response": chatbot.ask_question(query=query)}

    # "/" is replaced with "_" to derive the collection name from the repo name.
    collection_name = repo_name.replace("/", "_")
    files = github_ingestion.ingest_github_repo(repo_name=repo_name, access_token=token)
    collection = vector_store.set_up_chromadb(collection_name=collection_name)

    outcome = vector_store.index_vector_store_chroma(collection=collection, files=files)
    # Accept both a (flag, collection) tuple and a bare boolean, so a failed
    # indexing run is reported instead of crashing on tuple unpacking.
    if isinstance(outcome, tuple):
        is_indexed, collection = outcome
    else:
        is_indexed = bool(outcome)
    if not is_indexed:
        return {"status": "error", "message": "Failed to index repository in ChromaDB"}

    answer = vector_store.query_vector_store_chroma(collection=collection, query=query)
    if answer is None:
        return {"status": "error", "message": "Failed to query ChromaDB collection"}
    return {"status": "success", "response": answer}
def _gradio_chat(query, repo_name, token=""):
    """Adapt query_chat_route for Gradio: pass all three fields, return plain text."""
    # The route's parameter defaults are Form(...) markers, so every field
    # is passed explicitly rather than left to its default.
    result = query_chat_route(query=query, repo_name=repo_name, token=token)
    # The route returns a dict; the Code output component needs a string.
    text = result.get("response")
    if text is None:
        text = result.get("message", "")
    return str(text)


# Gradio front end for the chat route, mounted on the FastAPI app at /gradio.
io = gr.Interface(
    fn=_gradio_chat,
    title="Mindify Chat",
    inputs=[
        gr.Textbox(label="Query"),
        gr.Textbox(label="Repo Name"),
        gr.Textbox(label="Token", type="password"),
    ],
    outputs=gr.Code(label="Response", language="markdown"),
)
app = gr.mount_gradio_app(app, io, path="/gradio")
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app)

requirements.txt CHANGED Viewed

@@ -1,9 +1,9 @@
-fastapi==0.85.1
 langchain_community
 langchain_openai
 supabase
 uvicorn
-chromadb==0.3.29
 python-dotenv
-pydantic<2.0,>=1.9
 httpx[http2]

+fastapi
 langchain_community
 langchain_openai
 supabase
 uvicorn
+chromadb
 python-dotenv
+pydantic>=2.0
 httpx[http2]