from concurrent.futures import ThreadPoolExecutor, as_completed

from langchain_community.document_loaders.github import GithubFileLoader
from tqdm import tqdm


def fetch_file_content(loader, path):
    """
    Fetches the content of a single file through the given loader.

    Args:
        loader:
            An object exposing ``get_file_content_by_path(path)``.
        path: str
            The path of the file to fetch.

    Returns:
        tuple
            ``(content, path)``. ``content`` is None when the loader raised
            an exception or itself returned None.
    """
    try:
        content = loader.get_file_content_by_path(path)
    except Exception as e:
        # Best-effort: one failing file must not abort the whole ingestion,
        # so the error is reported and the file is marked as missing.
        print(f"Error fetching file {path}: {e}")
        return None, path
    return content, path


def ingest_github_repo(repo_name: str, access_token: str) -> list:
    """
    Ingests files from a GitHub repository and returns their contents.

    Args:
        repo_name: str
            The name of the GitHub repository in the format "username/repo_name".
        access_token: str
            The GitHub access token to access the repository. May be an empty
            string, in which case the loader is built without an explicit token.

    Returns:
        list
            A list of ``(content, path)`` tuples, one per file that was fetched
            successfully. Files whose fetch failed are skipped. Entries are in
            completion order, not in repository listing order.
    """
    if access_token != "":
        loader = GithubFileLoader(
            repo=repo_name,
            access_token=access_token,
        )
    else:
        print("No access token provided. Using public access.")
        # The loader must exist on this path too; previously `loader` was left
        # unbound here and the next statement raised UnboundLocalError.
        # NOTE(review): GithubFileLoader appears to fall back to the
        # GITHUB_PERSONAL_ACCESS_TOKEN environment variable when no token is
        # passed -- confirm against the installed langchain_community version.
        loader = GithubFileLoader(repo=repo_name)

    # List the directory contents for the repository
    file_paths = loader.get_file_paths()

    files = []
    print("Ingesting files from the repository...")
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(fetch_file_content, loader, file_path["path"])
            for file_path in file_paths
        ]
        for future in tqdm(as_completed(futures), total=len(futures)):
            content, path = future.result()
            # fetch_file_content always returns a tuple, so the failure signal
            # is the content element being None, not the result itself.
            if content is not None:
                files.append((content, path))
    return files