import re


class Preprocessor:
    """
    Helpers for preprocessing markdown content.

    - ``extract_section``: return the lines that follow a keyword line,
      up to the next blank line.
    - ``extract_dirs_from_text``: collect link targets and link titles
      from markdown links of the form ``(/path "title")``.
    """

    # Matches the parenthesised part of a markdown link whose target starts
    # with "/" and which carries a double-quoted title: (/some/path "title").
    # Group 1 is the target path, group 2 is the title.
    _LINK_PATTERN = re.compile(r'\((/[^ )]+)\s+"([^"]+)"\)')

    @staticmethod
    def extract_section(markdown_content: str, keyword: str = "Latest commit") -> str:
        """
        Return the block of lines that follows the first line containing
        ``keyword``.

        Collection starts on the line after the first match and stops at
        the first empty or whitespace-only line (or at the end of the
        content). Only the first line containing ``keyword`` is considered.

        Args:
            markdown_content: The markdown text to search.
            keyword: Substring identifying the marker line.

        Returns:
            The collected lines joined with ``"\\n"``, or the message
            ``"No content found after '<keyword>'."`` when no line contains
            the keyword or nothing follows it before a blank line.
        """
        lines = markdown_content.splitlines()
        collected = []

        for index, line in enumerate(lines):
            if keyword not in line:
                continue
            # Gather everything after the marker line until a blank line.
            for candidate in lines[index + 1:]:
                if not candidate.strip():
                    break
                collected.append(candidate)
            # Only the first occurrence of the keyword is used.
            break

        if not collected:
            return f"No content found after '{keyword}'."
        return "\n".join(collected)

    @staticmethod
    def extract_dirs_from_text(text: str) -> tuple:
        """
        Collect link targets and link titles from markdown links in ``text``.

        A link is recognised when it has the form ``(/path "title")``: the
        target must start with ``/`` and contain no spaces, and a
        double-quoted, non-empty title must follow.

        Args:
            text: Markdown text containing links.

        Returns:
            A ``(dirs, files)`` tuple of two sets of strings:

            - ``dirs``: every matched link target, stored as the full path
              exactly as it appears in the link.
            - ``files``: every matched link title that does not start
              with ``"."``.

        NOTE(review): despite the names, nothing here distinguishes
        directories from files; ``dirs`` holds all link targets and
        ``files`` holds all non-dot-prefixed titles. Confirm against callers.
        """
        dirs = set()
        files = set()

        for target, title in Preprocessor._LINK_PATTERN.findall(text):
            dirs.add(target)
            # Titles beginning with "." (e.g. ".github") are left out.
            if not title.startswith("."):
                files.add(title)

        return dirs, files