|
import re |
|
|
|
class Preprocessor:
    """
    Provides static helpers to preprocess markdown content:

    - extract the section that follows a specific keyword
    - extract link targets (directories) and link titles (file names)
      from markdown links
    """

    # Matches the parenthesised part of a markdown link that carries a title,
    # e.g. ``(/some/path "title")``. Group 1 is the link target (must start
    # with "/"), group 2 is the quoted title. Compiled once at class creation
    # instead of on every call.
    _LINK_RE = re.compile(r'\((/[^ )]+)\s+"([^"]+)"\)')

    @staticmethod
    def extract_section(markdown_content: str, keyword: str = "Latest commit") -> str:
        """
        Return the block of lines that follows the first line containing
        ``keyword``.

        Collection starts on the line after the first match and stops at the
        next blank (empty or whitespace-only) line or at the end of the text.

        Args:
            markdown_content: Text to search; it is split with ``splitlines()``.
            keyword: Substring that marks the line preceding the section.

        Returns:
            The collected lines joined with ``"\\n"``. If the keyword is never
            found, or no non-blank line directly follows it, the message
            ``"No content found after '<keyword>'."`` is returned instead.
        """
        not_found = f"No content found after '{keyword}'."
        lines = markdown_content.splitlines()

        # Index of the line right after the first line containing the keyword.
        start = next(
            (idx + 1 for idx, line in enumerate(lines) if keyword in line),
            None,
        )
        if start is None:
            return not_found

        section = []
        for line in lines[start:]:
            if not line.strip():
                # A blank line terminates the section.
                break
            section.append(line)

        return "\n".join(section) if section else not_found

    @staticmethod
    def extract_dirs_from_text(text: str) -> "tuple[set[str], set[str]]":
        """
        Collect link targets and link titles from markdown links in ``text``.

        Only links whose parenthesised part looks like ``(/target "title")``
        are considered: the target must start with ``/`` and be followed by a
        double-quoted title.

        Args:
            text: Markdown text to scan.

        Returns:
            A ``(dirs, files)`` tuple of sets:

            - ``dirs``: every matched link target, stored exactly as written
              (the full target string; no path component is stripped).
            - ``files``: every matched link title that does not start
              with ``"."``.
        """
        dirs = set()
        files = set()
        for target, title in Preprocessor._LINK_RE.findall(text):
            dirs.add(target)
            # Titles beginning with "." (dot-entries) are left out of `files`,
            # but their targets are still recorded in `dirs`.
            if not title.startswith("."):
                files.add(title)
        return dirs, files