import ast
import json
import os
import re
import subprocess


def show_project_structure(structure, spacing=0) -> str:
    """Render the repository structure as an indented, newline-separated tree.

    Only Python-related entries are shown: keys that contain a ``.`` but not
    ``.py`` are skipped, as are keys starting with ``test``.

    :param structure: Nested dict as produced by ``create_structure``.
    :param spacing: Number of leading spaces for entries at this level.
    :return: The rendered tree; directories carry a trailing ``/``.
    """
    pieces = []
    for key, value in structure.items():
        if '.' in key and '.py' not in key:
            continue  # skip non-Python files

        # TODO: maybe we should skip the test files...
        if key.startswith('test'):
            continue  # skip the test files as well...

        if '.' in key:
            pieces.append(' ' * spacing + str(key) + '\n')
        else:
            pieces.append(' ' * spacing + str(key) + '/' + '\n')
        # Parsed file entries carry a 'classes' key; anything else is treated
        # as a directory-like mapping and rendered one level deeper.
        if 'classes' not in value:
            pieces.append(show_project_structure(value, spacing + 4))
    return ''.join(pieces)


def clone_github_repo(github_url, local_path, commit_hash=None):
    """Clone a GitHub repository and optionally check out a specific commit.

    Failures of the ``git`` commands are reported with a printed warning
    rather than raised (best effort).

    :param github_url: URL passed to ``git clone``.
    :param local_path: Destination directory of the clone.
    :param commit_hash: Optional revision to ``git checkout`` after cloning.
    """
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        print(f"Successfully cloned repository to: {local_path}")

        # If commit hash is provided, checkout to that specific commit
        if commit_hash:
            subprocess.run(['git', 'checkout', commit_hash], cwd=local_path, check=True)
            print(f"Successfully checked out to commit: {commit_hash}")
    except subprocess.CalledProcessError as e:
        print(f"Warning: Repository cloning or checkout may have failed: {e}")


def parse_python_file(file_path, file_content=None):
    """Parse a Python file to extract class and function definitions with their line numbers.

    :param file_path: Path to the Python file (also used in error messages).
    :param file_content: Optional source text; when given, the file is not read.
    :return: ``(class_info, function_names, file_lines)``. Each class entry has
        ``name``, ``start_line``, ``end_line``, ``text`` and ``methods``; each
        function/method entry has ``name``, ``start_line``, ``end_line`` and
        ``text``. On a read or parse error the result is ``([], [], "")``.
    """
    try:
        if file_content is None:
            with open(file_path, "r", encoding="utf-8") as file:
                file_content = file.read()
        parsed_data = ast.parse(file_content)
    except Exception as e:  # Best effort: any read/parse failure yields an empty result
        print(f"Error in file {file_path}: {e}")
        return [], [], ""

    # Split once instead of once per definition.
    source_lines = file_content.splitlines()

    def _describe(node):
        """Return the name/line-span/text record for a class or function node."""
        return {
            "name": node.name,
            "start_line": node.lineno,
            "end_line": node.end_lineno,
            "text": source_lines[node.lineno - 1 : node.end_lineno],
        }

    class_info = []
    function_names = []
    # Identity (not name) of nodes already reported as class methods, so that a
    # module-level function sharing a name with some method is still reported.
    method_node_ids = set()

    # ast.walk visits a class before its body members, so every method is
    # registered before the walk reaches it as a plain FunctionDef node.
    for node in ast.walk(parsed_data):
        if isinstance(node, ast.ClassDef):
            methods = []
            for member in node.body:
                if isinstance(member, ast.FunctionDef):
                    methods.append(_describe(member))
                    method_node_ids.add(id(member))
            entry = _describe(node)
            entry["methods"] = methods
            class_info.append(entry)
        elif isinstance(node, ast.FunctionDef):
            # ast.AsyncFunctionDef is a separate node type and is not matched here.
            if id(node) not in method_node_ids:
                function_names.append(_describe(node))

    return class_info, function_names, source_lines


def create_structure(directory_path):
    """Create the structure of the repository directory by parsing Python files.

    Files directly inside ``directory_path`` are stored under a key named after
    the directory itself; sub-directories are stored under their path relative
    to ``directory_path``, one nested dict per path component.

    :param directory_path: Path to the repository directory.
    :return: A dictionary representing the structure. ``.py`` files map to
        ``{"classes", "functions", "text"}``; other files map to ``{}``.
    """
    structure = {}
    # normpath drops a trailing separator, which would otherwise make the
    # basename (and therefore the root key) an empty string.
    repo_name = os.path.basename(os.path.normpath(directory_path))

    for root, _, files in os.walk(directory_path):
        relative_root = os.path.relpath(root, directory_path)
        if relative_root == ".":
            relative_root = repo_name

        curr_struct = structure
        for part in relative_root.split(os.sep):
            curr_struct = curr_struct.setdefault(part, {})

        for file_name in files:
            if file_name.endswith(".py"):
                file_path = os.path.join(root, file_name)
                class_info, function_names, file_lines = parse_python_file(file_path)
                curr_struct[file_name] = {
                    "classes": class_info,
                    "functions": function_names,
                    "text": file_lines,
                }
            else:
                curr_struct[file_name] = {}

    return structure


def build_repo_structure(root_path):
    """Build repository structure using improved parsing method"""
    return create_structure(root_path)


def get_loc_prompt(issue_text, repo_structure):
    """Build the file-localization prompt.

    :param issue_text: The GitHub problem description.
    :param repo_structure: Textual repository tree to embed in the prompt.
    :return: The formatted prompt string.
    """
    obtain_relevant_files_prompt = """
Please look through the following GitHub problem description and Repository structure and provide a list of files that one would need to edit to fix the problem.

### GitHub Problem Description ###
{problem_statement}

###

### Repository Structure ###
{structure}

###

Please only provide the full path and return at most 5 files.
The returned files should be separated by new lines ordered by most to least important and wrapped with ```
For example:
```
file1.py
file2.py
```
"""
    prompt_content = obtain_relevant_files_prompt.format(
        problem_statement=issue_text, structure=repo_structure
    )
    return prompt_content


def get_repair_prompt(issue_text, file_content):
    """Build the repair prompt asking for *SEARCH/REPLACE* edits.

    :param issue_text: The issue text to embed.
    :param file_content: Source text of the relevant file(s); trailing
        whitespace is stripped before embedding.
    :return: The formatted prompt string.
    """
    repair_prompt_combine_topn_cot_diff = """
We are currently solving the following issue within our repository. Here is the issue text:
--- BEGIN ISSUE ---
{problem_statement}
--- END ISSUE ---

Below are some code segments, each from a relevant file. One or more of these files may contain bugs.
--- BEGIN FILE ---
```
{content}
```
--- END FILE ---

Please first localize the bug based on the issue statement, and then generate *SEARCH/REPLACE* edits to fix the issue.

Every *SEARCH/REPLACE* edit must use this format:
1. The file path
2. The start of search block: <<<<<<< SEARCH
3. A contiguous chunk of lines to search for in the existing source code
4. The dividing line: =======
5. The lines to replace into the source code
6. The end of the replace block: >>>>>>> REPLACE

Here is an example:

```python
### mathweb/flask/app.py
<<<<<<< SEARCH
from flask import Flask
=======
import math
from flask import Flask
>>>>>>> REPLACE
```

Please note that the *SEARCH/REPLACE* edit REQUIRES PROPER INDENTATION. If you would like to add the line '        print(x)', you must fully write that out, with all those spaces before the code!
Wrap the *SEARCH/REPLACE* edit in blocks ```python...```.
"""
    prompt_content = repair_prompt_combine_topn_cot_diff.format(
        problem_statement=issue_text, content=file_content.rstrip()
    )
    return prompt_content


def get_repo_files(structure, filepaths: list[str]):
    """Return the text of the requested files found in ``structure``.

    :param structure: Repository structure dict.
    :param filepaths: Paths (``/``-joined, as built by
        ``get_full_file_paths_and_classes_and_functions``) to look up.
    :return: Dict mapping each found path to its lines joined with newlines.
        Paths that are not found are omitted.
    """
    files, _, _ = get_full_file_paths_and_classes_and_functions(structure)

    # Index once; the first occurrence of a path wins. Entries that are bare
    # path strings (non-dict leaves) carry no text and are skipped.
    text_by_path = {}
    for entry in files:
        if isinstance(entry, tuple):
            text_by_path.setdefault(entry[0], entry[1])

    file_contents = dict()
    for filepath in filepaths:
        if filepath in text_by_path:
            file_contents[filepath] = '\n'.join(text_by_path[filepath])
    return file_contents


def correct_file_path_in_structure(file_name, structure):
    """
    Search for the correct file path in the structure, mainly checking first-level subdirectories

    Args:
        file_name (str): File name to search for
        structure (dict): Repository structure

    Returns:
        str: Correct file path if found, otherwise returns original file_name
    """
    # Search in current directory
    if get_repo_files(structure, [file_name]):
        return file_name

    # Only check first-level subdirectories
    for sub_dir, sub_structure in structure.items():
        if isinstance(sub_structure, dict) and get_repo_files(sub_structure, [file_name]):
            return f'{sub_dir}/{file_name}'

    return file_name


def get_full_file_paths_and_classes_and_functions(structure, current_path=''):
    """
    Recursively retrieve all file paths, classes, and functions within a directory structure.

    Arguments:
    structure -- a dictionary representing the directory structure
    current_path -- the path accumulated so far, used during recursion (default="")

    Returns:
    A tuple containing:
    - files: list of ``(full_path, text_lines)`` tuples for parsed file entries;
      entries whose value is not a dict are appended as a bare path string
    - classes: list of class details with file paths
    - functions: list of function details with file paths (the function dicts
      from ``structure`` themselves, with a ``'file'`` key added in place)
    """
    files = []
    classes = []
    functions = []
    file_keys = ('functions', 'classes', 'text')

    for name, content in structure.items():
        next_path = f'{current_path}/{name}' if current_path else name

        if not isinstance(content, dict):
            files.append(next_path)
            continue

        lacks_file_keys = not any(key in content for key in file_keys)
        # Guards against the case where 'functions'/'classes'/'text' are
        # themselves directory names inside the structure.
        has_dict_valued_key = any(
            isinstance(content.get(key, []), dict) for key in file_keys
        )
        if lacks_file_keys or len(content) != 3 or has_dict_valued_key:
            # Directory-like entry: descend.
            (
                sub_files,
                sub_classes,
                sub_functions,
            ) = get_full_file_paths_and_classes_and_functions(content, next_path)
            files.extend(sub_files)
            classes.extend(sub_classes)
            functions.extend(sub_functions)
            continue

        # Parsed file entry.
        files.append((next_path, content.get('text', [])))
        if content.get('text', []) == []:
            continue

        if 'classes' in content:
            for clazz in content['classes']:
                classes.append(
                    {
                        'file': next_path,
                        'name': clazz['name'],
                        'start_line': clazz['start_line'],
                        'end_line': clazz['end_line'],
                        'methods': [
                            {
                                'name': method['name'],
                                'start_line': method['start_line'],
                                'end_line': method['end_line'],
                            }
                            for method in clazz.get('methods', [])
                        ],
                    },
                )

        if 'functions' in content:
            for function in content['functions']:
                try:
                    function['file'] = next_path
                except TypeError:
                    # Entry does not support item assignment; skip it.
                    continue
                functions.append(function)

    return files, classes, functions


def post_process(response: str) -> str:
    """Strip the thinking section and extract fenced code blocks from a model response.

    :param response: Raw model output.
    :return: The contents of all triple-backtick blocks (fence markers removed,
        any language tag kept) joined by newlines, or the remaining text when
        no fenced block is present.
    """
    content = response
    if "◁/think▷" in content:
        content = content.replace("◁think▷", "")
        # Keep only what follows the last closing think marker.
        content = content.split("◁/think▷")[-1]

    # Extract content between triple backticks (```)
    matches = re.findall(r"```.*?```", content, re.DOTALL)
    if matches:
        # Return all matched code blocks joined by new lines
        return "\n".join(item.replace("```", "") for item in matches)
    return content  # If no match, return the full response