Spaces:
Sleeping
Sleeping
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from langchain_community.document_loaders.github import GithubFileLoader | |
| from tqdm import tqdm | |
def fetch_file_content(loader, path):
    """
    Fetch a single file through the loader without letting a failure propagate.

    Args:
        loader:
            Object exposing ``get_file_content_by_path(path)``.
        path:
            Path of the file to fetch; echoed back in the result so the
            caller can tell which request a result belongs to.

    Returns:
        tuple
            ``(content, path)``. ``content`` is ``None`` when the loader
            returned ``None`` or raised; in the latter case the error is
            printed rather than re-raised.
    """
    content = None
    try:
        content = loader.get_file_content_by_path(path)
    except Exception as err:
        # Best-effort: report the failure and fall through with content=None.
        print(f"Error fetching file {path}: {err}")
    return content, path
def ingest_github_repo(repo_name: str, access_token: str):
    """
    Ingests the files of a GitHub repository, fetching them concurrently.

    Args:
        repo_name: str
            The name of the GitHub repository in the format "username/repo_name".
        access_token: str
            The GitHub access token to access the repository. When empty, the
            loader is created without an explicit token.

    Returns:
        list
            One entry per file whose content was fetched successfully, in
            completion order (not repository order). Each entry is the value
            returned by ``fetch_file_content`` for that file, i.e. a
            ``(content, path)`` tuple. Files that failed to download or had
            no content are skipped.
    """
    loader_kwargs = {"repo": repo_name}
    if access_token:
        loader_kwargs["access_token"] = access_token
    else:
        print("No access token provided. Using public access.")
        # NOTE(review): the loader is built without a token here so that it is
        # always defined (previously this path crashed on an unbound name).
        # Whether GithubFileLoader accepts a missing token (e.g. by reading it
        # from the environment) depends on the installed version -- confirm.
    loader = GithubFileLoader(**loader_kwargs)

    # List the directory contents for the repository
    file_paths = loader.get_file_paths()

    files = []
    print("Ingesting files from the repository...")
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(fetch_file_content, loader, file_path["path"])
            for file_path in file_paths
        ]
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result is None:
                continue
            # The helper reports failures as (None, path); testing the tuple
            # itself against None would let those through, so test the content.
            content = result[0]
            if content is not None:
                files.append(result)
    return files