Spaces:
Sleeping
Sleeping
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from langchain_community.document_loaders.github import GithubFileLoader | |
from tqdm import tqdm | |
def fetch_file_content(loader, path):
    """
    Fetch a single file through ``loader`` and pair the result with its path.

    Args:
        loader:
            Any object exposing ``get_file_content_by_path(path)``.
        path:
            The path of the file to fetch; echoed back in the result.
    Returns:
        tuple
            ``(content, path)``. ``content`` is ``None`` when the loader
            returned ``None`` or raised; the error is printed, not propagated.
    """
    content = None
    try:
        content = loader.get_file_content_by_path(path)
    except Exception as err:
        # Best-effort: one unreadable file must not abort the whole ingest.
        print(f"Error fetching file {path}: {err}")
    return content, path
def ingest_github_repo(repo_name: str, access_token: str) -> list:
    """
    Ingests files from a GitHub repository, fetching them concurrently.

    Args:
        repo_name: str
            The name of the GitHub repository in the format "username/repo_name".
        access_token: str
            The GitHub access token to access the repository. When empty (or
            None), no token is passed and the loader is left to resolve
            credentials on its own.
    Returns:
        list
            A list of ``(content, path)`` tuples, one per file whose content
            was fetched successfully. Entries appear in completion order, not
            in repository order.
    """
    if access_token:
        loader = GithubFileLoader(
            repo=repo_name,
            access_token=access_token,
        )
    else:
        print("No access token provided. Using public access.")
        # This branch used to leave `loader` unassigned, so the listing call
        # below crashed with UnboundLocalError. Build the loader without an
        # explicit token instead.
        # NOTE(review): GithubFileLoader is expected to fall back to the
        # GITHUB_PERSONAL_ACCESS_TOKEN environment variable and raise a
        # descriptive error when it is unset -- confirm against the installed
        # langchain_community version.
        loader = GithubFileLoader(repo=repo_name)

    # List the directory contents for the repository
    file_paths = loader.get_file_paths()

    files = []
    print("Ingesting files from the repository...")
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(fetch_file_content, loader, file_path["path"])
            for file_path in file_paths
        ]
        for future in tqdm(as_completed(futures), total=len(futures)):
            fetched = future.result()
            # fetch_file_content reports a failed fetch as (None, path); the
            # old `is not None` test on the tuple itself never filtered those.
            if fetched is not None and fetched[0] is not None:
                files.append(fetched)
    return files