Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- cura/chatbot.py +16 -0
- cura/github_ingestion.py +33 -23
- cura/vector_store.py +24 -24
- index.py +30 -55
- requirements.txt +3 -3
cura/chatbot.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from openai import OpenAI
|
2 |
+
|
3 |
+
def ask_question(query: str):
    """Send *query* to the OpenAI chat completions API and return the reply.

    Returns the assistant's response text on success, or an error string
    of the form "Failed to ask question: ..." on any failure.
    """
    client = OpenAI()
    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a code assistant."},
                {"role": "user", "content": query},
            ],
            max_tokens=1024,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Failed to ask question: {e}"
|
cura/github_ingestion.py
CHANGED
@@ -1,11 +1,18 @@
|
|
1 |
-
|
2 |
-
"""
|
3 |
-
GitHub Repo File Ingestion and Indexing
|
4 |
-
"""
|
5 |
-
|
6 |
from langchain_community.document_loaders.github import GithubFileLoader
|
7 |
from tqdm import tqdm
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
def ingest_github_repo(repo_name: str, access_token: str):
|
10 |
"""
|
11 |
Ingests files from a GitHub repository and returns the files as a list of strings.
|
@@ -20,27 +27,30 @@ def ingest_github_repo(repo_name: str, access_token: str):
|
|
20 |
list
|
21 |
A list of strings containing the contents of the files in the repository.
|
22 |
"""
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
28 |
# List the directory contents for the repository
|
29 |
file_paths = loader.get_file_paths()
|
30 |
|
31 |
-
# Load the files from the repository using curl
|
32 |
files = []
|
33 |
-
|
34 |
print("Ingesting files from the repository...")
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
42 |
files.append(file)
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
return files, file_paths
|
|
|
1 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
|
|
|
|
|
2 |
from langchain_community.document_loaders.github import GithubFileLoader
|
3 |
from tqdm import tqdm
|
4 |
|
5 |
+
|
6 |
+
def fetch_file_content(loader, path):
    """Fetch a single file's contents through *loader*.

    Returns (content, path) on success, and (None, path) when the loader
    yields nothing or raises (the error is printed, not propagated).
    """
    try:
        content = loader.get_file_content_by_path(path)
    except Exception as e:
        print(f"Error fetching file {path}: {e}")
        return None, path
    if content is not None:
        return content, path
    return None, path
|
14 |
+
|
15 |
+
|
16 |
def ingest_github_repo(repo_name: str, access_token: str):
    """
    Ingests files from a GitHub repository and returns the files as a list of strings.

    Args:
        repo_name: str
            The "owner/repo" name of the repository to ingest.
        access_token: str
            GitHub access token; may be empty for public access.

    Returns:
        list
            A list of strings containing the contents of the files in the repository.
    """
    # `is not ""` compared identity, not equality — use `!=`.
    if access_token != "":
        loader = GithubFileLoader(
            repo=repo_name,
            access_token=access_token,
        )
    else:
        # Previously `loader` was never bound on this path, so the call
        # below raised NameError; fall back to an unauthenticated loader.
        print("No access token provided. Using public access.")
        loader = GithubFileLoader(repo=repo_name, access_token="")

    # List the directory contents for the repository
    file_paths = loader.get_file_paths()

    files = []
    print("Ingesting files from the repository...")

    # Fetch file contents concurrently; fetch_file_content returns a
    # (content, path) tuple with content == None on failure.
    with ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(fetch_file_content, loader, file_path["path"]): file_path
            for file_path in file_paths
        }

        for future in tqdm(as_completed(futures), total=len(futures)):
            content, _path = future.result()
            # The old check (`result is not None`) was always true because
            # the helper returns a tuple; failed fetches were appended and
            # the list held tuples, not strings. Test the content instead.
            if content is not None:
                files.append(content)

    return files
|
|
|
|
cura/vector_store.py
CHANGED
@@ -1,25 +1,26 @@
|
|
1 |
-
|
2 |
"""
|
3 |
Vector Store for Mindify Chat
|
4 |
"""
|
5 |
|
6 |
import chromadb
|
|
|
|
|
7 |
|
8 |
def set_up_chromadb(collection_name: str):
|
9 |
"""
|
10 |
Set up a ChromaDB collection for storing vectors.
|
11 |
-
|
12 |
Args:
|
13 |
collection_name: str
|
14 |
The name of the collection to create or retrieve.
|
15 |
-
|
16 |
Returns:
|
17 |
ChromaDB Collection
|
18 |
The ChromaDB collection object.
|
19 |
"""
|
20 |
chroma_client = chromadb.Client()
|
21 |
-
|
22 |
-
try:
|
23 |
# Check if the collection already exists
|
24 |
collection = chroma_client.get_collection(name=collection_name)
|
25 |
return collection
|
@@ -29,51 +30,50 @@ def set_up_chromadb(collection_name: str):
|
|
29 |
return collection
|
30 |
|
31 |
|
32 |
-
def
|
33 |
"""
|
34 |
Index the files in the ChromaDB collection.
|
35 |
-
|
36 |
Args:
|
37 |
collection: ChromaDB Collection
|
38 |
The collection to store the vectors in.
|
39 |
files: list
|
40 |
A list of strings containing the contents of the files.
|
41 |
-
|
42 |
Returns:
|
43 |
bool
|
44 |
True if the data is stored successfully, False otherwise.
|
45 |
"""
|
46 |
-
# Set up collection
|
47 |
-
|
48 |
-
collection = chromadb.Client().get_collection(name=collection_name)
|
49 |
-
except:
|
50 |
-
collection = chromadb.Client().create_collection(name=collection_name)
|
51 |
-
|
52 |
print("Indexing files...")
|
53 |
ids = []
|
54 |
for i in range(len(files[0])):
|
55 |
ids.append(str(i))
|
56 |
-
|
57 |
print("Storing GitHub data in ChromaDB...")
|
58 |
try:
|
59 |
collection.add(ids=ids, documents=files[0])
|
60 |
print("Data stored successfully!")
|
61 |
-
|
62 |
-
return True
|
63 |
except:
|
64 |
print("Error storing data in ChromaDB")
|
65 |
return False
|
66 |
-
|
67 |
-
|
|
|
68 |
"""
|
69 |
Query the ChromaDB collection for similar vectors to the query vector.
|
70 |
"""
|
71 |
print("Querying ChromaDB...")
|
72 |
try:
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
77 |
except:
|
78 |
print("Error querying ChromaDB")
|
79 |
-
return None
|
|
|
|
|
1 |
"""
|
2 |
Vector Store for Mindify Chat
|
3 |
"""
|
4 |
|
5 |
import chromadb
|
6 |
+
from chromadb import Collection
|
7 |
+
|
8 |
|
9 |
def set_up_chromadb(collection_name: str):
|
10 |
"""
|
11 |
Set up a ChromaDB collection for storing vectors.
|
12 |
+
|
13 |
Args:
|
14 |
collection_name: str
|
15 |
The name of the collection to create or retrieve.
|
16 |
+
|
17 |
Returns:
|
18 |
ChromaDB Collection
|
19 |
The ChromaDB collection object.
|
20 |
"""
|
21 |
chroma_client = chromadb.Client()
|
22 |
+
|
23 |
+
try:
|
24 |
# Check if the collection already exists
|
25 |
collection = chroma_client.get_collection(name=collection_name)
|
26 |
return collection
|
|
|
30 |
return collection
|
31 |
|
32 |
|
33 |
+
def index_vector_store_chroma(collection: "Collection", files: list):
    """
    Index the files in the ChromaDB collection.

    Args:
        collection: ChromaDB Collection
            The collection to store the vectors in.
        files: list
            A list whose first element is the list of document strings.

    Returns:
        tuple
            (True, collection) if the data is stored successfully,
            (False, collection) otherwise.
    """
    print("Indexing files...")
    # One string id per document in files[0].
    ids = [str(i) for i in range(len(files[0]))]

    print("Storing GitHub data in ChromaDB...")
    try:
        collection.add(ids=ids, documents=files[0])
        print("Data stored successfully!")
        return True, collection
    except Exception:
        # Was a bare `except:` that returned a lone False, which crashed
        # callers unpacking two values; keep the (flag, collection) shape
        # on both paths.
        print("Error storing data in ChromaDB")
        return False, collection
|
63 |
+
|
64 |
+
|
65 |
+
def query_vector_store_chroma(collection: "Collection", query: str):
    """
    Query the ChromaDB collection for similar vectors to the query vector.

    Args:
        collection: ChromaDB Collection
            The collection to query.
        query: str
            The free-text query.

    Returns:
        str | None
            The closest matching document, or None on error.
    """
    print("Querying ChromaDB...")
    try:
        results = collection.query(
            query_texts=[query],
            n_results=2,
        )
        print("Query successful!")
        # Top hit for the single query text.
        return results["documents"][0][0]
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate.
        print("Error querying ChromaDB")
        return None
|
index.py
CHANGED
@@ -1,14 +1,12 @@
|
|
1 |
-
|
2 |
-
from fastapi import FastAPI
|
3 |
from fastapi.middleware.cors import CORSMiddleware
|
4 |
-
from
|
5 |
-
from
|
6 |
-
|
|
|
7 |
|
8 |
app = FastAPI(
|
9 |
-
title="Mindify Chat API",
|
10 |
-
description="API for Mindify Chat",
|
11 |
-
version="0.1"
|
12 |
)
|
13 |
|
14 |
app.add_middleware(
|
@@ -16,62 +14,39 @@ app.add_middleware(
|
|
16 |
allow_origins=["*"],
|
17 |
allow_credentials=True,
|
18 |
allow_methods=["*"],
|
19 |
-
allow_headers=["*"]
|
20 |
)
|
21 |
|
22 |
-
# Define request body models
|
23 |
-
class AccessTokenRequest(BaseModel):
|
24 |
-
token: str
|
25 |
-
user_email: str
|
26 |
-
|
27 |
-
class RepoRequest(BaseModel):
|
28 |
-
repo_name: str
|
29 |
-
user_email: str
|
30 |
|
31 |
@app.get("/")
|
32 |
def read_root():
|
33 |
return {"Hello": "World"}
|
34 |
|
35 |
-
@app.post("/
|
36 |
-
def
|
37 |
-
token = request.token
|
38 |
-
user_email = request.user_email
|
39 |
-
post_github_access_token(token, user_email)
|
40 |
-
return {"status": "success"}
|
41 |
-
|
42 |
-
@app.post("/github/repo")
|
43 |
-
def post_github_repo_route(request: RepoRequest):
|
44 |
-
repo_name = request.repo_name
|
45 |
-
user_email = request.user_email
|
46 |
-
post_github_repo(repo_name, user_email)
|
47 |
-
return {"status": "success"}
|
48 |
-
|
49 |
-
@app.post("/github/index")
|
50 |
-
def index_github_repo_route(request: RepoRequest):
|
51 |
-
repo_name = request.repo_name
|
52 |
-
user_email = request.user_email
|
53 |
-
access_token = get_github_access_token(user_email)
|
54 |
-
collection_name = repo_name.replace("/", "_")
|
55 |
-
if access_token is not None:
|
56 |
-
files = github_ingestion.ingest_github_repo(repo_name, access_token)
|
57 |
-
results = vector_store.index_vector_store(files=files, collection_name = collection_name)
|
58 |
-
if results:
|
59 |
-
return {"status": "success", "message": "GitHub data stored in ChromaDB"}
|
60 |
-
else:
|
61 |
-
return {"status": "error", "message": "Failed to set up ChromaDB collection"}
|
62 |
-
|
63 |
-
else:
|
64 |
-
return {"status": "error", "message": "Failed to get GitHub access token"}
|
65 |
-
|
66 |
-
@app.post("/github/query")
|
67 |
-
def query_github_repo_route(repo_name: str, query: str):
|
68 |
collection_name = repo_name.replace("/", "_")
|
69 |
-
if
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
72 |
else:
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
if __name__ == "__main__":
|
76 |
import uvicorn
|
|
|
77 |
uvicorn.run(app)
|
|
|
1 |
+
from fastapi import FastAPI, Form
|
|
|
2 |
from fastapi.middleware.cors import CORSMiddleware
|
3 |
+
from database import post_github_access_token, post_github_repo, get_github_access_token
|
4 |
+
from cura import github_ingestion, vector_store, chatbot
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
|
8 |
app = FastAPI(
|
9 |
+
title="Mindify Chat API", description="API for Mindify Chat", version="0.1"
|
|
|
|
|
10 |
)
|
11 |
|
12 |
app.add_middleware(
|
|
|
14 |
allow_origins=["*"],
|
15 |
allow_credentials=True,
|
16 |
allow_methods=["*"],
|
17 |
+
allow_headers=["*"],
|
18 |
)
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
@app.get("/")
|
22 |
def read_root():
|
23 |
return {"Hello": "World"}
|
24 |
|
25 |
+
@app.post("/chat/query")
def query_chat_route(query: str = Form(...), repo_name: str = Form(...), token: str = Form(...)):
    """Answer *query*, grounded on *repo_name* when one is provided.

    When a repo name is given, the repo is ingested, indexed into ChromaDB
    and queried for the closest document; otherwise the question is sent
    straight to the chatbot.
    """
    collection_name = repo_name.replace("/", "_")
    if repo_name:
        files = github_ingestion.ingest_github_repo(repo_name=repo_name, access_token=token)
        collection = vector_store.set_up_chromadb(collection_name=collection_name)
        result = vector_store.index_vector_store_chroma(collection=collection, files=files)
        # Tolerate both return shapes: (flag, collection) on success and a
        # bare False on failure — the unconditional two-value unpack here
        # crashed whenever indexing failed.
        if isinstance(result, tuple):
            is_indexed, collection = result
        else:
            is_indexed = result
        if is_indexed:
            query = vector_store.query_vector_store_chroma(collection=collection, query=query)
    else:
        print("No repo name provided. Using default collection.")
        query = chatbot.ask_question(query=query)

    return {"status": "success", "response": query}
|
39 |
+
|
40 |
+
|
41 |
+
# Gradio front-end mounted next to the API.
io = gr.Interface(
    fn=query_chat_route,
    title="Mindify Chat",
    # The handler takes (query, repo_name, token); the token box was
    # missing, so gradio invoked the function with only two arguments.
    inputs=[
        gr.Textbox(label="Query"),
        gr.Textbox(label="Repo Name"),
        gr.Textbox(label="Token", type="password"),
    ],
    outputs=gr.Code(label="Response", language="markdown"),
)
app = gr.mount_gradio_app(app, io, path="/gradio")
|
48 |
+
|
49 |
if __name__ == "__main__":
|
50 |
import uvicorn
|
51 |
+
|
52 |
uvicorn.run(app)
|
requirements.txt
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
-
fastapi
|
2 |
langchain_community
|
3 |
langchain_openai
|
4 |
supabase
|
5 |
uvicorn
|
6 |
-
chromadb
|
7 |
python-dotenv
|
8 |
-
pydantic
|
9 |
httpx[http2]
|
|
|
1 |
+
fastapi
|
2 |
langchain_community
|
3 |
langchain_openai
|
4 |
supabase
|
5 |
uvicorn
|
6 |
+
chromadb
|
7 |
python-dotenv
|
8 |
+
pydantic>=2.0
|
9 |
httpx[http2]
|