AutoReadmeAgent / utils /preprocessor.py
bogeumkim's picture
Add preprocessor.py
b57f297
raw
history blame
1.63 kB
import re
class Preprocessor:
    """
    Provides static helpers to preprocess markdown content:
    - extract the block of lines that follows a specific keyword
    - extract link targets and link titles from markdown links
    """

    @staticmethod
    def extract_section(markdown_content: str, keyword: str = "Latest commit") -> str:
        """
        Return the lines that follow the first line containing `keyword`.

        Collection starts on the line after the first match and stops at the
        first empty or whitespace-only line, or at the end of the content.
        Later occurrences of `keyword` are not treated as new markers; if one
        appears inside the collected block it is returned like any other line.

        Args:
            markdown_content: The markdown text to search.
            keyword: Substring that marks the line preceding the section.

        Returns:
            The collected lines joined with a newline, or the message
            "No content found after '<keyword>'." when the keyword is absent
            or nothing non-blank follows it.
        """
        lines = markdown_content.splitlines()
        section = []
        for idx, line in enumerate(lines):
            if keyword in line:
                for candidate in lines[idx + 1:]:
                    if not candidate.strip():
                        # A blank line terminates the section.
                        break
                    section.append(candidate)
                # Only the first line containing the keyword is considered.
                break
        return "\n".join(section) if section else f"No content found after '{keyword}'."

    @staticmethod
    def extract_dirs_from_text(text: str):
        """
        Collect link targets and link titles from markdown links in `text`.

        A link is recognised when it has the form `(/target "title")`: the
        target starts with "/" and contains no space or ")", and the title is
        a non-empty double-quoted string.

        NOTE(review): despite the method name, the first returned set holds
        the full link target of every matched link; no filename is stripped
        and no directory/file distinction is made. Confirm this is intended.

        Args:
            text: The markdown text to scan.

        Returns:
            A tuple `(dirs, files)` of two sets:
            - dirs: every matched link target, unmodified.
            - files: every matched link title that does not start with ".".
        """
        pattern = r'\((/[^ )]+)\s+"([^"]+)"\)'
        matches = re.findall(pattern, text)
        dirs = {target for target, _ in matches}
        # Titles beginning with "." (dot-prefixed names) are left out.
        files = {title for _, title in matches if not title.startswith(".")}
        return dirs, files