"""Helpers for cloning a repository, indexing its files and building LLM prompts.

The module turns a directory tree into a nested dictionary ("structure"),
renders that structure as text, builds the file-localization and repair
prompts, and post-processes model responses.
"""

import ast
import difflib
import json
import os
import re
import subprocess

# Directory names that are never indexed (in addition to hidden directories).
_IGNORED_DIR_NAMES = frozenset({'__pycache__', 'node_modules'})

# Non-Python files with one of these suffixes have their text loaded into the
# structure; every other non-Python file is recorded with an empty text.
_TEXT_FILE_EXTENSIONS = (
    '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h', '.hpp',
    '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
    '.sh', '.bat', '.ps1', '.sql', '.html', '.css', '.scss', '.less',
    '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
    '.md', '.txt', '.rst', '.tex', '.r', '.R', '.m', '.pl', '.lua',
)

# Keys that mark a structure entry as a file rather than a directory.
_FILE_ENTRY_KEYS = ('text', 'functions', 'classes')

# Matches every fenced block (opening fence through closing fence).
_CODE_FENCE_RE = re.compile(r"```.*?```", re.DOTALL)

_RELEVANT_FILES_PROMPT = """
Please look through the following GitHub problem description and Repository structure and provide a list of files that one would need to edit to fix the problem.

### GitHub Problem Description ###
{problem_statement}

###

### Repository Structure ###
{structure}

###

Please only provide the full path and return at most 5 files.
The returned files should be separated by new lines ordered by most to least important and wrapped with ```
For example:
```
file1.py
file2.py
```
"""

_REPAIR_PROMPT = """
We are currently solving the following issue within our repository. Here is the issue text:
--- BEGIN ISSUE ---
{problem_statement}
--- END ISSUE ---

Below are some code segments, each from a relevant file. One or more of these files may contain bugs.

--- BEGIN FILE ---
```
{content}
```
--- END FILE ---

Please first localize the bug based on the issue statement, and then generate *SEARCH/REPLACE* edits to fix the issue.

Every *SEARCH/REPLACE* edit must use this format:
1. The file path
2. The start of search block: <<<<<<< SEARCH
3. A contiguous chunk of lines to search for in the existing source code
4. The dividing line: =======
5. The lines to replace into the source code
6. The end of the replace block: >>>>>>> REPLACE

Here is an example:

```python
### mathweb/flask/app.py
<<<<<<< SEARCH
from flask import Flask
=======
import math
from flask import Flask
>>>>>>> REPLACE
```

Please note that the *SEARCH/REPLACE* edit REQUIRES PROPER INDENTATION. If you would like to add the line '        print(x)', you must fully write that out, with all those spaces before the code!
Wrap the *SEARCH/REPLACE* edit in blocks ```python...```.
"""


def show_project_structure(structure, spacing=0) -> str:
    """Render the structure as an indented listing of Python files and directories.

    :param structure: Nested dictionary mapping names to sub-structures.
    :param spacing: Number of spaces to indent the entries of this level.
    :return: One line per kept entry; directories carry a trailing ``/``.
    """
    indent = ' ' * spacing
    pieces = []
    for key, value in structure.items():
        # Skip non-Python files (any dotted name that does not contain '.py').
        if '.' in key and '.py' not in key:
            continue
        # Skip test files and test directories as well.
        if key.startswith('test'):
            continue

        if '.' in key:
            pieces.append(f'{indent}{key}\n')
        else:
            pieces.append(f'{indent}{key}/\n')
        # Entries without a 'text' key are treated as containers and expanded.
        if 'text' not in value:
            pieces.append(show_project_structure(value, spacing + 4))
    return ''.join(pieces)


def clone_github_repo(github_url, local_path, commit_hash=None):
    """Clone a Git repository and optionally check out a specific commit.

    Failures of the ``git`` commands are reported as a printed warning and
    are not re-raised.

    :param github_url: URL passed to ``git clone``.
    :param local_path: Destination directory of the clone.
    :param commit_hash: Commit to check out after cloning; skipped when falsy.
    """
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        print(f"Successfully cloned repository to: {local_path}")

        if commit_hash:
            subprocess.run(['git', 'checkout', commit_hash], cwd=local_path, check=True)
            print(f"Successfully checked out to commit: {commit_hash}")
    except subprocess.CalledProcessError as e:
        print(f"Warning: Repository cloning or checkout may have failed: {e}")


def _node_record(node, source_lines):
    """Describe a class/function AST node by name, line span and source text."""
    return {
        "name": node.name,
        "start_line": node.lineno,
        "end_line": node.end_lineno,
        "text": source_lines[node.lineno - 1 : node.end_lineno],
    }


def parse_python_file(file_path, file_content=None):
    """Parse a Python file to extract class and function definitions.

    :param file_path: Path to the Python file; read (as UTF-8) only when
        ``file_content`` is None, otherwise used for error messages only.
    :param file_content: Source text to parse instead of reading the file.
    :return: ``(classes, functions, lines)``. Each class record has ``name``,
        ``start_line``, ``end_line``, ``text`` and ``methods``; each function
        record has the first four keys. ``async def`` definitions are not
        listed. On any read or parse error the error is printed and
        ``([], [], "")`` is returned.
    """
    try:
        if file_content is None:
            with open(file_path, "r", encoding="utf-8") as file:
                file_content = file.read()
        parsed_data = ast.parse(file_content)
    except Exception as e:  # best effort: an unreadable file must not abort indexing
        print(f"Error in file {file_path}: {e}")
        return [], [], ""

    source_lines = file_content.splitlines()
    nodes = list(ast.walk(parsed_data))

    # Identify methods by node identity, in a separate pass, so that the
    # result does not depend on the (unspecified) order of ast.walk and a
    # plain function sharing its name with a method is still reported.
    method_node_ids = set()
    for node in nodes:
        if isinstance(node, ast.ClassDef):
            for child in node.body:
                if isinstance(child, ast.FunctionDef):
                    method_node_ids.add(id(child))

    class_info = []
    function_names = []
    for node in nodes:
        if isinstance(node, ast.ClassDef):
            record = _node_record(node, source_lines)
            record["methods"] = [
                _node_record(child, source_lines)
                for child in node.body
                if isinstance(child, ast.FunctionDef)
            ]
            class_info.append(record)
        elif isinstance(node, ast.FunctionDef) and id(node) not in method_node_ids:
            function_names.append(_node_record(node, source_lines))

    return class_info, function_names, source_lines


def _is_ignored_dir(dir_name):
    """Return True for hidden directories and well-known generated directories."""
    return dir_name.startswith('.') or dir_name in _IGNORED_DIR_NAMES


def create_structure(directory_path):
    """Create the structure of the repository directory by parsing its files.

    Hidden directories, ``__pycache__`` and ``node_modules`` are not visited;
    hidden files and ``.pyc`` files are not recorded.

    :param directory_path: Path to the repository directory.
    :return: A nested dictionary. Directories map to dictionaries; ``.py``
        files map to ``{"classes", "functions", "text"}``; all other files
        map to ``{"text": lines}`` (empty for unrecognised extensions or
        unreadable files).
    """
    structure = {}

    for root, dirs, files in os.walk(directory_path):
        # Prune in place so that os.walk never descends into ignored directories.
        dirs[:] = [name for name in dirs if not _is_ignored_dir(name)]

        # Locate (creating as needed) the dictionary for the current directory.
        curr_struct = structure
        relative_root = os.path.relpath(root, directory_path)
        if relative_root != ".":
            for part in relative_root.split(os.sep):
                curr_struct = curr_struct.setdefault(part, {})

        # Register sub-directories up front so empty ones are kept too.
        for dir_name in dirs:
            curr_struct.setdefault(dir_name, {})

        for file_name in files:
            if file_name.startswith('.') or file_name.endswith('.pyc'):
                continue

            file_path = os.path.join(root, file_name)

            if file_name.endswith(".py"):
                try:
                    class_info, function_names, file_lines = parse_python_file(file_path)
                    curr_struct[file_name] = {
                        "classes": class_info,
                        "functions": function_names,
                        "text": file_lines,
                    }
                except Exception as e:  # keep indexing the remaining files
                    print(f"Failed to parse Python file {file_path}: {e}")
                    curr_struct[file_name] = {"text": []}
            elif file_name.endswith(_TEXT_FILE_EXTENSIONS):
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        curr_struct[file_name] = {"text": f.read().splitlines()}
                except Exception as e:  # keep indexing the remaining files
                    print(f"Failed to read file {file_path}: {e}")
                    curr_struct[file_name] = {"text": []}
            else:
                curr_struct[file_name] = {"text": []}

    return structure


def build_repo_structure(root_path):
    """Build the repository structure for ``root_path`` (see ``create_structure``)."""
    return create_structure(root_path)


def get_loc_prompt(issue_text, repo_structure):
    """Build the prompt asking for the files that must be edited.

    :param issue_text: Problem description inserted into the prompt.
    :param repo_structure: Textual repository structure inserted into the prompt.
    :return: The formatted prompt.
    """
    return _RELEVANT_FILES_PROMPT.format(
        problem_statement=issue_text, structure=repo_structure
    )


def get_repair_prompt(issue_text, file_content):
    """Build the prompt asking for *SEARCH/REPLACE* edits that fix the issue.

    :param issue_text: Problem description inserted into the prompt.
    :param file_content: Code shown to the model; trailing whitespace is stripped.
    :return: The formatted prompt.
    """
    return _REPAIR_PROMPT.format(
        problem_statement=issue_text, content=file_content.rstrip()
    )


def get_repo_files(structure, filepaths: list[str]):
    """Return the text of the requested files found in the structure.

    :param structure: Repository structure to search.
    :param filepaths: Full ``/``-separated paths relative to the structure root.
    :return: Dictionary mapping each path that was found to its text joined
        with newlines; paths that are not found are omitted.
    """
    files, _, _ = get_full_file_paths_and_classes_and_functions(structure)

    text_by_path = {}
    for path, text in files:
        text_by_path.setdefault(path, text)  # the first occurrence wins

    return {
        filepath: '\n'.join(text_by_path[filepath])
        for filepath in filepaths
        if filepath in text_by_path
    }


def correct_file_path_in_structure(file_name, structure):
    """
    Search for the correct file path in the structure, mainly checking
    first-level subdirectories.

    Args:
        file_name (str): File name to search for
        structure (dict): Repository structure

    Returns:
        str: Correct file path if found, otherwise returns original file_name
    """
    # The name may already be a valid path relative to the root.
    if get_repo_files(structure, [file_name]):
        return file_name

    # Otherwise try it relative to each first-level directory.
    for sub_dir, sub_structure in structure.items():
        if not isinstance(sub_structure, dict) or _is_file_entry(sub_structure):
            continue
        if get_repo_files(sub_structure, [file_name]):
            return f'{sub_dir}/{file_name}'

    return file_name


def _is_file_entry(content):
    """Return True when a structure dictionary describes a file, not a directory.

    A file entry carries at least one of ``text``/``functions``/``classes``,
    none of them being a dictionary (that would be a child that merely
    happens to have one of those names), and either has exactly three keys or
    carries ``text``.
    """
    if not any(key in content for key in _FILE_ENTRY_KEYS):
        return False
    if any(isinstance(content.get(key), dict) for key in _FILE_ENTRY_KEYS):
        return False
    return len(content) == 3 or 'text' in content


def get_full_file_paths_and_classes_and_functions(structure, current_path=''):
    """
    Recursively retrieve all file paths, classes, and functions within a directory structure.

    The structure itself is not modified.

    Arguments:
    structure -- a dictionary representing the directory structure
    current_path -- the path accumulated so far, used during recursion (default="")

    Returns:
    A tuple containing:
    - files: list of (full file path, text lines) tuples
    - classes: list of class details with file paths
    - functions: list of function details with file paths
    """
    files = []
    classes = []
    functions = []

    for name, content in structure.items():
        next_path = f'{current_path}/{name}' if current_path else name

        if not isinstance(content, dict):
            # Keep the (path, text) shape that every consumer relies on.
            files.append((next_path, []))
            continue

        if not _is_file_entry(content):
            sub_files, sub_classes, sub_functions = (
                get_full_file_paths_and_classes_and_functions(content, next_path)
            )
            files.extend(sub_files)
            classes.extend(sub_classes)
            functions.extend(sub_functions)
            continue

        text = content.get('text', [])
        files.append((next_path, text))
        if text == []:
            continue

        for clazz in content.get('classes', []):
            classes.append(
                {
                    'file': next_path,
                    'name': clazz['name'],
                    'start_line': clazz['start_line'],
                    'end_line': clazz['end_line'],
                    'methods': [
                        {
                            'name': method['name'],
                            'start_line': method['start_line'],
                            'end_line': method['end_line'],
                        }
                        for method in clazz.get('methods', [])
                    ],
                }
            )

        for function in content.get('functions', []):
            if not isinstance(function, dict):
                continue  # malformed record: nothing to annotate
            # Copy instead of annotating in place so the caller's structure
            # (and results of earlier calls) are left untouched.
            functions.append({**function, 'file': next_path})

    return files, classes, functions


def post_process(response: str) -> str:
    """Strip the reasoning section and extract fenced blocks from a response.

    :param response: Raw model response.
    :return: The contents of all fenced blocks (fences removed) joined by
        newlines, or the whole remaining response when it has no fenced block.
    """
    content = response
    if "◁/think▷" in content:
        content = content.replace("◁think▷", "")
        # Keep only what follows the last end-of-thinking marker.
        content = content.split("◁/think▷")[-1]

    matches = _CODE_FENCE_RE.findall(content)
    if matches:
        return "\n".join(block.replace("```", "") for block in matches)
    return content


def _best_path_match(model_file, all_file_paths, cutoff):
    """Return the known path that best matches ``model_file``, or None.

    Rules are tried in strict priority order over all paths: exact match,
    match of trailing path components, same base name, closest fuzzy match.
    """
    if model_file in all_file_paths:
        return model_file

    # Anchor the suffix at a path boundary so 'utils.py' != 'myutils.py'.
    suffix = '/' + model_file.lstrip('/')
    for file_path in all_file_paths:
        if file_path.endswith(suffix):
            return file_path

    base_name = os.path.basename(model_file)
    for file_path in all_file_paths:
        if os.path.basename(file_path) == base_name:
            return file_path

    close_matches = difflib.get_close_matches(
        model_file, all_file_paths, n=1, cutoff=cutoff
    )
    return close_matches[0] if close_matches else None


def correct_file_paths(model_found_files, files, similarity_threshold=0.8):
    """Map file names proposed by the model onto paths that really exist.

    :param model_found_files: File names/paths proposed by the model; may be
        empty or None.
    :param files: Sequence of ``(path, text)`` entries of the repository.
    :param similarity_threshold: Cutoff for the fuzzy-match fallback.
    :return: The matched repository paths, in the order of the proposals;
        proposals without any match are dropped.
    """
    if not model_found_files:
        return []

    all_file_paths = [entry[0] for entry in files]
    found_files = []
    for model_file in model_found_files:
        model_file = model_file.strip()
        if not model_file:
            continue
        match = _best_path_match(model_file, all_file_paths, similarity_threshold)
        if match is not None:
            found_files.append(match)
    return found_files