MarkChenX committed on
Commit
1585a60
·
verified ·
1 Parent(s): 1a736cb

Upload 7 files

Browse files
Files changed (5) hide show
  1. cura/chatbot.py +16 -0
  2. cura/github_ingestion.py +33 -23
  3. cura/vector_store.py +24 -24
  4. index.py +30 -55
  5. requirements.txt +3 -3
cura/chatbot.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+
3
def ask_question(query: str):
    """Send a single user query to the OpenAI chat completions API.

    Args:
        query: The user's question, forwarded verbatim as the user message.

    Returns:
        The text content of the first completion choice, or a string of the
        form ``"Failed to ask question: <error>"`` if anything goes wrong.
    """
    try:
        # Built inside the try: constructing the client can itself fail
        # (e.g. no API key configured), and that failure should be reported
        # the same way as a failed request instead of escaping to the caller.
        client = OpenAI()
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a code assistant."},
                {"role": "user", "content": query},
            ],
            max_tokens=1024,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Failed to ask question: {e}"
cura/github_ingestion.py CHANGED
@@ -1,11 +1,18 @@
1
-
2
- """
3
- GitHub Repo File Ingestion and Indexing
4
- """
5
-
6
  from langchain_community.document_loaders.github import GithubFileLoader
7
  from tqdm import tqdm
8
 
 
 
 
 
 
 
 
 
 
 
 
9
  def ingest_github_repo(repo_name: str, access_token: str):
10
  """
11
  Ingests files from a GitHub repository and returns the files as a list of strings.
@@ -20,27 +27,30 @@ def ingest_github_repo(repo_name: str, access_token: str):
20
  list
21
  A list of strings containing the contents of the files in the repository.
22
  """
23
- loader = GithubFileLoader(
24
- repo=repo_name,
25
- access_token=access_token,
26
- )
27
-
 
 
 
28
  # List the directory contents for the repository
29
  file_paths = loader.get_file_paths()
30
 
31
- # Load the files from the repository using curl
32
  files = []
33
-
34
  print("Ingesting files from the repository...")
35
- for i in tqdm(range(len(file_paths))):
36
- try:
37
- file = loader.get_file_content_by_path(file_paths[i]["path"])
38
- # If the file is not textual file, skip it
39
- if file is None:
40
- continue
41
- else:
 
 
 
42
  files.append(file)
43
- except:
44
- continue
45
-
46
- return files, file_paths
 
1
+ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 
 
2
  from langchain_community.document_loaders.github import GithubFileLoader
3
  from tqdm import tqdm
4
 
5
+
6
def fetch_file_content(loader, path):
    """Fetch one file's content through the given loader.

    Args:
        loader: Object exposing ``get_file_content_by_path(path)``.
        path: Path of the file to fetch, passed straight to the loader.

    Returns:
        A ``(content, path)`` tuple. ``content`` is ``None`` when the loader
        returned ``None`` or raised; in the latter case the error is printed.
    """
    try:
        # Only the loader call can raise, so it is the only thing guarded.
        content = loader.get_file_content_by_path(path)
    except Exception as e:
        print(f"Error fetching file {path}: {e}")
        return None, path
    return content, path
14
+
15
+
16
  def ingest_github_repo(repo_name: str, access_token: str):
17
  """
18
  Ingests files from a GitHub repository and returns the files as a list of strings.
 
27
  list
28
  A list of strings containing the contents of the files in the repository.
29
  """
30
+ if access_token is not "":
31
+ loader = GithubFileLoader(
32
+ repo=repo_name,
33
+ access_token=access_token,
34
+ )
35
+ else:
36
+ print("No access token provided. Using public access.")
37
+
38
  # List the directory contents for the repository
39
  file_paths = loader.get_file_paths()
40
 
 
41
  files = []
42
+
43
  print("Ingesting files from the repository...")
44
+
45
+ with ThreadPoolExecutor() as executor:
46
+ futures = {
47
+ executor.submit(fetch_file_content, loader, file_path["path"]): file_path
48
+ for file_path in file_paths
49
+ }
50
+
51
+ for future in tqdm(as_completed(futures), total=len(futures)):
52
+ file = future.result()
53
+ if file is not None:
54
  files.append(file)
55
+
56
+ return files
 
 
cura/vector_store.py CHANGED
@@ -1,25 +1,26 @@
1
-
2
  """
3
  Vector Store for Mindify Chat
4
  """
5
 
6
  import chromadb
 
 
7
 
8
  def set_up_chromadb(collection_name: str):
9
  """
10
  Set up a ChromaDB collection for storing vectors.
11
-
12
  Args:
13
  collection_name: str
14
  The name of the collection to create or retrieve.
15
-
16
  Returns:
17
  ChromaDB Collection
18
  The ChromaDB collection object.
19
  """
20
  chroma_client = chromadb.Client()
21
-
22
- try:
23
  # Check if the collection already exists
24
  collection = chroma_client.get_collection(name=collection_name)
25
  return collection
@@ -29,51 +30,50 @@ def set_up_chromadb(collection_name: str):
29
  return collection
30
 
31
 
32
- def index_vector_store(collection_name:str, files: list):
33
  """
34
  Index the files in the ChromaDB collection.
35
-
36
  Args:
37
  collection: ChromaDB Collection
38
  The collection to store the vectors in.
39
  files: list
40
  A list of strings containing the contents of the files.
41
-
42
  Returns:
43
  bool
44
  True if the data is stored successfully, False otherwise.
45
  """
46
- # Set up collection
47
- try:
48
- collection = chromadb.Client().get_collection(name=collection_name)
49
- except:
50
- collection = chromadb.Client().create_collection(name=collection_name)
51
-
52
  print("Indexing files...")
53
  ids = []
54
  for i in range(len(files[0])):
55
  ids.append(str(i))
56
-
57
  print("Storing GitHub data in ChromaDB...")
58
  try:
59
  collection.add(ids=ids, documents=files[0])
60
  print("Data stored successfully!")
61
-
62
- return True
63
  except:
64
  print("Error storing data in ChromaDB")
65
  return False
66
-
67
- def query_vector_store(collection_name: str, query: str):
 
68
  """
69
  Query the ChromaDB collection for similar vectors to the query vector.
70
  """
71
  print("Querying ChromaDB...")
72
  try:
73
- list_collection = chromadb.Client().list_collections()
74
- print(list_collection)
75
- collection = chromadb.Client().get_collection(name=collection_name)
76
- return collection.query(query_texts=query, n_results=5)
 
 
77
  except:
78
  print("Error querying ChromaDB")
79
- return None
 
 
1
  """
2
  Vector Store for Mindify Chat
3
  """
4
 
5
  import chromadb
6
+ from chromadb import Collection
7
+
8
 
9
  def set_up_chromadb(collection_name: str):
10
  """
11
  Set up a ChromaDB collection for storing vectors.
12
+
13
  Args:
14
  collection_name: str
15
  The name of the collection to create or retrieve.
16
+
17
  Returns:
18
  ChromaDB Collection
19
  The ChromaDB collection object.
20
  """
21
  chroma_client = chromadb.Client()
22
+
23
+ try:
24
  # Check if the collection already exists
25
  collection = chroma_client.get_collection(name=collection_name)
26
  return collection
 
30
  return collection
31
 
32
 
33
def index_vector_store_chroma(collection: "Collection", files: list):
    """
    Index the files in the ChromaDB collection.

    Args:
        collection: ChromaDB Collection
            The collection to store the vectors in.
        files: list
            A list of strings containing the contents of the files.
            ``(content, path)`` pairs are accepted as well; only the content
            is stored, and entries whose content is ``None`` are skipped.

    Returns:
        tuple
            ``(True, collection)`` if the data is stored successfully,
            ``(False, collection)`` otherwise. Both outcomes have the same
            shape so callers can always unpack two values.
    """
    # The annotation is quoted so it is not evaluated when the function is
    # defined; it still names chromadb's Collection type.
    print("Indexing files...")
    documents = []
    for entry in files:
        content = entry[0] if isinstance(entry, (tuple, list)) else entry
        if content is not None:
            documents.append(content)
    # Ids are the positions of the stored documents, as strings.
    ids = [str(position) for position in range(len(documents))]

    print("Storing GitHub data in ChromaDB...")
    if not documents:
        # Nothing to add; report failure without calling the collection.
        print("Error storing data in ChromaDB")
        return False, collection

    try:
        collection.add(ids=ids, documents=documents)
    except Exception:
        print("Error storing data in ChromaDB")
        return False, collection

    print("Data stored successfully!")
    return True, collection
63
+
64
+
65
def query_vector_store_chroma(collection: "Collection", query: str):
    """
    Query the ChromaDB collection for the document most similar to the query.

    Args:
        collection: ChromaDB Collection
            The collection to search.
        query: str
            The query text; it is sent as a single-element ``query_texts`` list.

    Returns:
        The first document of the first result list, or ``None`` if the query
        fails or yields no document.
    """
    # The annotation is quoted so it is not evaluated when the function is
    # defined; it still names chromadb's Collection type.
    print("Querying ChromaDB...")
    try:
        results = collection.query(
            query_texts=[query],
            n_results=2,
        )
        # Two results are requested but only the top-ranked one is returned.
        top_document = results["documents"][0][0]
    except Exception:
        print("Error querying ChromaDB")
        return None

    # Announce success only once a document has actually been extracted.
    print("Query successful!")
    return top_document
index.py CHANGED
@@ -1,14 +1,12 @@
1
-
2
- from fastapi import FastAPI
3
  from fastapi.middleware.cors import CORSMiddleware
4
- from .database import post_github_access_token, post_github_repo, get_github_access_token
5
- from .cura import github_ingestion, vector_store
6
- from pydantic import BaseModel
 
7
 
8
  app = FastAPI(
9
- title="Mindify Chat API",
10
- description="API for Mindify Chat",
11
- version="0.1"
12
  )
13
 
14
  app.add_middleware(
@@ -16,62 +14,39 @@ app.add_middleware(
16
  allow_origins=["*"],
17
  allow_credentials=True,
18
  allow_methods=["*"],
19
- allow_headers=["*"]
20
  )
21
 
22
- # Define request body models
23
- class AccessTokenRequest(BaseModel):
24
- token: str
25
- user_email: str
26
-
27
- class RepoRequest(BaseModel):
28
- repo_name: str
29
- user_email: str
30
 
31
  @app.get("/")
32
  def read_root():
33
  return {"Hello": "World"}
34
 
35
- @app.post("/github/access_token")
36
- def post_github_access_token_route(request: AccessTokenRequest):
37
- token = request.token
38
- user_email = request.user_email
39
- post_github_access_token(token, user_email)
40
- return {"status": "success"}
41
-
42
- @app.post("/github/repo")
43
- def post_github_repo_route(request: RepoRequest):
44
- repo_name = request.repo_name
45
- user_email = request.user_email
46
- post_github_repo(repo_name, user_email)
47
- return {"status": "success"}
48
-
49
- @app.post("/github/index")
50
- def index_github_repo_route(request: RepoRequest):
51
- repo_name = request.repo_name
52
- user_email = request.user_email
53
- access_token = get_github_access_token(user_email)
54
- collection_name = repo_name.replace("/", "_")
55
- if access_token is not None:
56
- files = github_ingestion.ingest_github_repo(repo_name, access_token)
57
- results = vector_store.index_vector_store(files=files, collection_name = collection_name)
58
- if results:
59
- return {"status": "success", "message": "GitHub data stored in ChromaDB"}
60
- else:
61
- return {"status": "error", "message": "Failed to set up ChromaDB collection"}
62
-
63
- else:
64
- return {"status": "error", "message": "Failed to get GitHub access token"}
65
-
66
- @app.post("/github/query")
67
- def query_github_repo_route(repo_name: str, query: str):
68
  collection_name = repo_name.replace("/", "_")
69
- if collection_name is not None:
70
- response = vector_store.query_vector_store(collection_name=collection_name, query=query)
71
- return {"status": "success", "response": response}
 
 
 
72
  else:
73
- return {"status": "error", "message": "Failed to set up ChromaDB collection"}
74
-
 
 
 
 
 
 
 
 
 
 
 
 
75
  if __name__ == "__main__":
76
  import uvicorn
 
77
  uvicorn.run(app)
 
1
+ from fastapi import FastAPI, Form
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
+ from database import post_github_access_token, post_github_repo, get_github_access_token
4
+ from cura import github_ingestion, vector_store, chatbot
5
+ import gradio as gr
6
+
7
 
8
  app = FastAPI(
9
+ title="Mindify Chat API", description="API for Mindify Chat", version="0.1"
 
 
10
  )
11
 
12
  app.add_middleware(
 
14
  allow_origins=["*"],
15
  allow_credentials=True,
16
  allow_methods=["*"],
17
+ allow_headers=["*"],
18
  )
19
 
 
 
 
 
 
 
 
 
20
 
21
  @app.get("/")
22
  def read_root():
23
  return {"Hello": "World"}
24
 
25
@app.post("/chat/query")
def query_chat_route(query: str = Form(...), repo_name: str = Form(...), token: str = Form(...)):
    """Answer a query, from an indexed GitHub repository when one is named.

    With a non-empty ``repo_name`` the repository is ingested, stored in a
    ChromaDB collection named after it (``/`` replaced by ``_``), and the
    collection is queried. With an empty ``repo_name`` the query is sent to
    the chatbot instead.

    Args:
        query: The user's question.
        repo_name: Repository identifier passed to the ingestion step.
        token: GitHub access token passed to the ingestion step.

    Returns:
        ``{"status": "success", "response": ...}`` on success, or
        ``{"status": "error", "message": ...}`` when indexing fails.
    """
    if not repo_name:
        print("No repo name provided. Using default collection.")
        answer = chatbot.ask_question(query=query)
        return {"status": "success", "response": answer}

    collection_name = repo_name.replace("/", "_")
    files = github_ingestion.ingest_github_repo(repo_name=repo_name, access_token=token)
    collection = vector_store.set_up_chromadb(collection_name=collection_name)

    # The indexer may report failure as a bare False rather than a
    # (flag, collection) pair, so both shapes are accepted here.
    outcome = vector_store.index_vector_store_chroma(collection=collection, files=files)
    if isinstance(outcome, tuple):
        is_indexed, collection = outcome
    else:
        is_indexed = bool(outcome)

    if not is_indexed:
        # Previously this path returned "success" with the user's own query
        # echoed back as the response.
        return {"status": "error", "message": "Failed to index repository in ChromaDB"}

    answer = vector_store.query_vector_store_chroma(collection=collection, query=query)
    return {"status": "success", "response": answer}
39
+
40
+
41
def _gradio_query(query, repo_name, token):
    """Adapt query_chat_route to Gradio: three text inputs in, text out."""
    result = query_chat_route(query=query, repo_name=repo_name, token=token)
    # The route answers with a dict, while the Code output displays text.
    return result.get("response") or result.get("message", "")


# One input component per route parameter. With only two inputs the route's
# ``token`` argument fell back to its ``Form(...)`` default object.
io = gr.Interface(
    fn=_gradio_query,
    title="Mindify Chat",
    inputs=[
        gr.Textbox(label="Query"),
        gr.Textbox(label="Repo Name"),
        gr.Textbox(label="GitHub Token", type="password"),
    ],
    outputs=gr.Code(label="Response", language="markdown"),
)
app = gr.mount_gradio_app(app, io, path="/gradio")
48
+
49
  if __name__ == "__main__":
50
  import uvicorn
51
+
52
  uvicorn.run(app)
requirements.txt CHANGED
@@ -1,9 +1,9 @@
1
- fastapi==0.85.1
2
  langchain_community
3
  langchain_openai
4
  supabase
5
  uvicorn
6
- chromadb==0.3.29
7
  python-dotenv
8
- pydantic<2.0,>=1.9
9
  httpx[http2]
 
1
+ fastapi
2
  langchain_community
3
  langchain_openai
4
  supabase
5
  uvicorn
6
+ chromadb
7
  python-dotenv
8
+ pydantic>=2.0
9
  httpx[http2]