Spaces:

Agents-MCP-Hackathon
/

AutoReadmeAgent

Running

AutoReadmeAgent / utils /preprocessor.py

Add preprocessor.py

b57f297 2 months ago

1.63 kB

	import re

	class Preprocessor:
	    """
	    Helpers for pulling pieces out of markdown text.

	    - ``extract_section``: the run of non-blank lines that follows the first
	      line containing a keyword.
	    - ``extract_dirs_from_text``: the link targets and link titles found in
	      ``(/path "title")`` fragments.
	    """

	    # Matches the parenthesised part of a markdown link that carries a title,
	    # i.e. ``(/some/path "title")``. Group 1 is the path (must start with "/"),
	    # group 2 is the quoted title.
	    _LINK_PATTERN = re.compile(r'\((/[^ )]+)\s+"([^"]+)"\)')

	    @staticmethod
	    def extract_section(markdown_content: str, keyword: str = "Latest commit") -> str:
	        """
	        Return the lines that follow the first line containing ``keyword``.

	        Collection starts on the line after the match and stops at the first
	        line that is empty or whitespace-only (or at the end of the text).
	        Only the first matching line is considered.

	        Args:
	            markdown_content: Text to search; it is split with ``splitlines()``.
	            keyword: Substring that marks the line preceding the section.

	        Returns:
	            The collected lines joined with ``"\\n"``, or the message
	            ``"No content found after '<keyword>'."`` when the keyword is
	            absent or nothing non-blank directly follows it.
	        """
	        lines = markdown_content.splitlines()
	        section = []
	        for index, line in enumerate(lines):
	            if keyword not in line:
	                continue
	            for following in lines[index + 1:]:
	                if following.strip() == "":
	                    break
	                section.append(following)
	            # Later occurrences of the keyword are deliberately ignored.
	            break
	        if section:
	            return "\n".join(section)
	        return f"No content found after '{keyword}'."

	    @staticmethod
	    def extract_dirs_from_text(text: str):
	        """
	        Collect link paths and link titles from ``(/path "title")`` fragments.

	        Args:
	            text: Markdown text to scan.

	        Returns:
	            A ``(dirs, files)`` tuple of two sets of strings:

	            - ``dirs``: every captured path, unmodified (the full link target,
	              not just its parent directory).
	            - ``files``: every captured title that does not start with ``"."``.

	        NOTE(review): the names suggest the input is a repository file
	        listing where titles are entry names; confirm against the caller.
	        """
	        matches = Preprocessor._LINK_PATTERN.findall(text)
	        dirs = {path for path, _title in matches}
	        # Titles beginning with "." (dot-entries) are excluded from `files`,
	        # but their paths are still kept in `dirs`.
	        files = {title for _path, title in matches if not title.startswith(".")}
	        return dirs, files