File size: 1,632 Bytes
b57f297 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import re
class Preprocessor:
    """
    Provides methods to preprocess markdown content:
    - extract the section that follows a specific keyword
    - extract link targets and link titles from markdown links
    """

    # Matches the parenthesised part of a titled markdown link, e.g.
    # ``(/owner/repo/tree/main/src "src")``. Group 1 is the link target
    # (it must start with "/"), group 2 is the quoted title.
    _LINK_PATTERN = re.compile(r'\((/[^ )]+)\s+"([^"]+)"\)')

    @staticmethod
    def extract_section(markdown_content, keyword="Latest commit"):
        """
        Extract the lines that follow the first line containing 'keyword',
        up to (but not including) the next blank line.

        Args:
            markdown_content: Markdown text to search. ``None`` or an empty
                string is treated as text without any match.
            keyword: Substring marking the line that precedes the section.

        Returns:
            The extracted lines joined with newlines, or the message
            "No content found after '<keyword>'." when the keyword is not
            present or the line right after it is blank or missing.
        """
        not_found = f"No content found after '{keyword}'."
        if not markdown_content:
            return not_found

        lines = markdown_content.splitlines()
        extract = []
        for i, line in enumerate(lines):
            if keyword not in line:
                continue
            # Collect from the line after the keyword; a whitespace-only
            # line counts as blank and ends the section.
            for next_line in lines[i + 1:]:
                if not next_line.strip():
                    break
                extract.append(next_line)
            # Only the first occurrence of the keyword is considered.
            break
        return "\n".join(extract) if extract else not_found

    @staticmethod
    def extract_dirs_from_text(text):
        """
        Collect link targets and link titles from markdown links in text.

        Only links written as ``(/target "title")`` are recognised: the
        target must start with "/" and be followed by a double-quoted title.

        Args:
            text: Markdown text to scan. ``None`` or an empty string yields
                two empty sets.

        Returns:
            A tuple ``(dirs, files)`` of sets. ``dirs`` holds every matched
            link target, stored in full (no path component is stripped).
            ``files`` holds the matched titles that do not start with ".".
        """
        dirs = set()
        files = set()
        if not text:
            return dirs, files

        for target, title in Preprocessor._LINK_PATTERN.findall(text):
            dirs.add(target)
            # Titles starting with "." (e.g. dotfiles) are left out.
            if not title.startswith("."):
                files.add(title)
        return dirs, files