mindify-chat-api-demo / cura /github_ingestion.py
MarkChenX's picture
Update cura/github_ingestion.py
395dce9 verified
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_community.document_loaders.github import GithubFileLoader
from tqdm import tqdm
def fetch_file_content(loader, path):
    """
    Fetch the content of a single repository file, never raising.

    Args:
        loader:
            Object exposing ``get_file_content_by_path(path)``.
        path:
            Path of the file to fetch, passed through to the loader.

    Returns:
        tuple
            ``(content, path)``. ``content`` is ``None`` when the loader
            returned ``None`` or raised; in the latter case the error is
            printed rather than propagated.
    """
    content = None
    try:
        content = loader.get_file_content_by_path(path)
    except Exception as e:
        # Best-effort: report the failure and fall through with content=None
        print(f"Error fetching file {path}: {e}")
    return content, path
def ingest_github_repo(repo_name: str, access_token: str):
    """
    Ingests files from a GitHub repository concurrently.

    Args:
        repo_name: str
            The name of the GitHub repository in the format "username/repo_name".
        access_token: str
            The GitHub access token to access the repository. Must not be empty.

    Returns:
        list
            A list of ``(content, path)`` tuples, one for each file whose
            content was fetched successfully. Entries appear in completion
            order, which is not necessarily the repository listing order.

    Raises:
        ValueError: If ``access_token`` is an empty string.
    """
    if access_token == "":
        # No loader can be built without a token; fail with a clear message
        # instead of hitting an unbound `loader` further down.
        raise ValueError(
            "A GitHub access token is required to ingest a repository."
        )

    loader = GithubFileLoader(
        repo=repo_name,
        access_token=access_token,
    )

    # List the directory contents for the repository
    file_paths = loader.get_file_paths()

    files = []
    print("Ingesting files from the repository...")
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(fetch_file_content, loader, file_path["path"])
            for file_path in file_paths
        ]
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            # A failed fetch is reported as (None, path) (or possibly a bare
            # None); the tuple itself is never None, so test the content.
            if result is None or result[0] is None:
                continue
            files.append(result)
    return files