|
import os |
|
import re |
|
from pathlib import Path |
|
import yaml |
|
|
|
|
|
# Repository root: two directory levels above this file's resolved location
# (i.e. the parent of the directory that contains this script).
REPO_ROOT = Path(__file__).resolve().parent.parent

# Keywords searched for in document bodies by extract_tags(); a keyword found
# via case-insensitive substring match is added to the document's tags.
KEYWORD_TAGS = [
    "CCore", "CShell", "REPL", "Mesh", "Agent", "HMP",
    "MeshConsensus", "CogSync", "GMP", "EGP",
    "Ethics", "Scenarios", "JSON"
]

# Output locations. These are relative paths, so they are interpreted against
# the current working directory at run time.
# NOTE(review): REPO_ROOT is anchored to the script location while ROOT_DIR is
# the CWD; the two only coincide when the script is run from the repository
# root -- confirm that this is the intended invocation.
ROOT_DIR = Path(".")
STRUCTURED_DIR = ROOT_DIR / "structured_md"
INDEX_FILE = STRUCTURED_DIR / "index.md"

# File suffix (lower-case, including the dot) that identifies Markdown files.
MD_EXT = ".md"
|
|
|
|
|
# str.format templates for the fenced JSON-LD snippet, keyed by document type.
# Doubled braces ({{ and }}) are str.format escapes that render as literal
# braces; single-brace names are the placeholders:
#   FAQ     -> {main_entity}
#   HowTo   -> {title}, {description}, {steps}
#   Article -> {title}, {description}
# Placeholder values are inserted verbatim (no JSON escaping happens here).
JSON_LD_TEMPLATES = {
    "FAQ": """\n```json
{{
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": {main_entity}
}}
```\n""",
    "HowTo": """\n```json
{{
"@context": "https://schema.org",
"@type": "HowTo",
"name": "{title}",
"description": "{description}",
"step": {steps}
}}
```\n""",
    "Article": """\n```json
{{
"@context": "https://schema.org",
"@type": "Article",
"name": "{title}",
"description": "{description}"
}}
```\n"""
}
|
|
|
# Matches a YAML front-matter block at the very start of the text: an opening
# "---" line, the YAML body (group 1, captured lazily; DOTALL lets it span
# several lines), and a closing "---" line including its trailing newline.
FRONT_MATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
|
|
|
def is_md_file(path):
    """Return True if *path* is a Markdown file outside the structured_md tree.

    Args:
        path: A ``pathlib.Path``; may be relative or absolute.

    Returns:
        bool: True when the suffix equals ``MD_EXT`` (case-insensitively) and
        the file does not live under ``STRUCTURED_DIR``.
    """
    if path.suffix.lower() != MD_EXT:
        return False
    # STRUCTURED_DIR is a relative path, so comparing it directly against the
    # parents of an absolute path can never match. Resolve both sides so the
    # exclusion works for relative and absolute inputs alike.
    return STRUCTURED_DIR.resolve() not in path.resolve().parents
|
|
|
def extract_front_matter(content: str):
    """Split a leading YAML front-matter block off *content*.

    Args:
        content: Full text of a Markdown document.

    Returns:
        tuple: ``(front_matter, clean_content)``. ``front_matter`` is always a
        dict (empty when there is no front-matter block, when it fails to
        parse, or when it parses to something other than a mapping).
        ``clean_content`` is the text with the front-matter block removed, or
        the unchanged input when no block is present.
    """
    match = FRONT_MATTER_RE.match(content)
    if not match:
        return {}, content

    try:
        data = yaml.safe_load(match.group(1))
    except Exception:
        # Deliberate best-effort: an unparsable header is treated as absent
        # rather than aborting the whole run.
        data = None

    # safe_load returns None for an empty block and a scalar/list for
    # non-mapping YAML; callers rely on dict methods, so normalise to a dict.
    if not isinstance(data, dict):
        data = {}

    return data, content[match.end():]
|
|
|
def detect_file_type(content: str, front_matter: dict | None = None) -> str: |
|
"""Определяет тип: FAQ / HowTo / Article (по front-matter или заголовкам).""" |
|
front_matter = front_matter or {} |
|
if "type" in front_matter: |
|
return front_matter["type"] |
|
|
|
|
|
if re.search(r"^#\s*FAQ\b", content, re.MULTILINE) or re.search(r"^##\s*Q&A\b", content, re.MULTILINE): |
|
return "FAQ" |
|
if re.search(r"^#\s*HowTo\b", content, re.MULTILINE) or re.search(r"^#\s*Как\s+сделать\b", content, re.IGNORECASE | re.MULTILINE): |
|
return "HowTo" |
|
return "Article" |
|
|
|
def parse_front_matter(content):
    """Return the YAML front matter of *content* as a dict.

    Args:
        content: Full text of a Markdown document.

    Returns:
        dict: The parsed front matter, or an empty dict when there is no
        front-matter block, it cannot be parsed, or it does not parse to a
        mapping (e.g. an empty block, for which ``safe_load`` yields None).
    """
    match = FRONT_MATTER_RE.match(content)
    if not match:
        return {}

    try:
        data = yaml.safe_load(match.group(1))
    except Exception:
        # Deliberate best-effort: treat an unparsable header as absent.
        return {}

    # Guarantee a dict so callers can use `in` / `.get` without checks.
    return data if isinstance(data, dict) else {}
|
|
|
def determine_type(content, front_matter):
    """Classify a document as FAQ / HowTo / Article.

    Args:
        content: Markdown text to inspect.
        front_matter: Parsed front matter; ``None`` or any non-dict value is
            treated as "no front matter" instead of raising.

    Returns:
        The front matter's ``type`` value when present; otherwise "FAQ" or
        "HowTo" if any heading line (a line starting with ``#``) contains that
        word, else "Article".
    """
    meta = front_matter if isinstance(front_matter, dict) else {}
    if "type" in meta:
        return meta["type"]

    # FAQ is checked first, so a heading mentioning both words counts as FAQ.
    for label in ("FAQ", "HowTo"):
        if re.search(rf"^#.*{label}", content, re.MULTILINE):
            return label
    return "Article"
|
|
|
def generate_json_ld(content, front_matter, ftype, title, rel_path):
    """Build the fenced JSON-LD snippet for one document.

    The document is assembled as a dict and serialised with ``json.dumps`` so
    that titles/descriptions containing quotes, backslashes or newlines still
    yield valid JSON, and so that the ``url`` property is always present.

    Args:
        content: Markdown body used to extract questions/steps and the
            fallback description.
        front_matter: Dict of front-matter values; ``description`` is used
            when present.
        ftype: "FAQ" or "HowTo"; any other value produces an Article.
        title: Document title used as ``name`` (HowTo and Article).
        rel_path: ``pathlib.Path`` of the document relative to the mirror
            root; used to build ``url``.

    Returns:
        str: A ```` ```json ```` fenced block, preceded and followed by a
        newline.
    """
    import json  # local import: json is not imported at module level

    desc = front_matter.get("description", content[:100].replace("\n", " ") + "...")
    url = f"structured_md/{rel_path.as_posix()}"

    doc = {"@context": "https://schema.org"}

    if ftype == "FAQ":
        # Every "##" heading is a question; its answer is the text up to the
        # next such heading (or end of text). Slicing between matches keeps an
        # empty answer empty instead of swallowing the following question.
        headings = list(re.finditer(r"^##\s*(.+)$", content, re.MULTILINE))
        main_entity = []
        for idx, heading in enumerate(headings):
            if idx + 1 < len(headings):
                answer_end = headings[idx + 1].start()
            else:
                answer_end = len(content)
            main_entity.append({
                "@type": "Question",
                "name": heading.group(1),
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": content[heading.end():answer_end].strip(),
                },
            })
        doc["@type"] = "FAQPage"
        doc["mainEntity"] = main_entity
    elif ftype == "HowTo":
        # Each "- item" bullet line becomes one step.
        steps = [
            {"@type": "HowToStep", "name": item.strip()}
            for item in re.findall(r"^- (.+)$", content, re.MULTILINE)
        ]
        doc["@type"] = "HowTo"
        doc["name"] = title
        doc["description"] = desc
        doc["step"] = steps
    else:
        doc["@type"] = "Article"
        doc["name"] = title
        doc["description"] = desc

    doc["url"] = url
    return "\n```json\n" + json.dumps(doc, ensure_ascii=False, indent=2) + "\n```\n"
|
|
|
def add_index_link(content, file_path):
    """Append a footer linking to the structured_md index, if not yet present.

    Args:
        content: Markdown text of the document.
        file_path: ``pathlib.Path`` of the document; the link is made relative
            to its parent directory.

    Returns:
        str: *content* with the footer appended, or unchanged when the footer
        (ignoring surrounding whitespace) already occurs in it.
    """
    rel_path = os.path.relpath(STRUCTURED_DIR / "index.md", file_path.parent)
    # os.path.relpath uses the platform separator; Markdown links need "/"
    # (on Windows the separator would otherwise be a backslash).
    rel_path = rel_path.replace(os.sep, "/")

    link_line = f"\n\n---\n> ⚡ [AI friendly version docs (structured_md)]({rel_path})\n"
    if link_line.strip() in content:
        return content
    return content + link_line
|
|
|
def extract_tags(content, existing_tags):
    """Merge existing tags with the keywords from KEYWORD_TAGS found in *content*.

    Args:
        content: Document text to scan (case-insensitive substring match).
        existing_tags: Tags already declared for the document; may be None,
            an iterable of tags, or a single tag given as a string.

    Returns:
        list: De-duplicated tags in a stable order -- existing tags first (in
        their given order), then matched keywords in KEYWORD_TAGS order.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(existing_tags, str):
        existing_tags = [existing_tags]

    lowered = content.lower()  # hoisted: invariant across keywords
    matched = [kw for kw in KEYWORD_TAGS if kw.lower() in lowered]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output does not vary between runs the way set iteration order does.
    return list(dict.fromkeys([*(existing_tags or []), *matched]))
|
|
|
def mirror_md_files():
    """Mirror every Markdown file under REPO_ROOT into STRUCTURED_DIR.

    Files inside any ``structured_md`` directory and files named ``index.md``
    (case-insensitive) are skipped. Each mirrored copy consists of:

    1. a regenerated YAML front matter (title, description, type, tags),
    2. the original body (without its own front matter) plus the index-link
       footer,
    3. a fenced JSON-LD block.

    Returns:
        list: Paths of the processed files, relative to REPO_ROOT, in sorted
        order.
    """
    processed = []

    # sorted(): rglob order depends on the filesystem; sorting makes runs
    # reproducible.
    for path in sorted(REPO_ROOT.rglob("*.md")):
        if "structured_md" in path.parts or path.name.lower() == "index.md":
            continue

        rel_path = path.relative_to(REPO_ROOT)
        target_path = STRUCTURED_DIR / rel_path
        target_path.parent.mkdir(parents=True, exist_ok=True)

        with path.open("r", encoding="utf-8") as f:
            content = f.read()

        front_matter, clean_content = extract_front_matter(content)
        # Front matter that is not a mapping cannot be queried with .get();
        # treat it as absent.
        if not isinstance(front_matter, dict):
            front_matter = {}
        ftype = detect_file_type(clean_content, front_matter)

        # Title comes from the first level-1 heading. The (?!#) lookahead
        # keeps "##"/"###" headings from being taken as the title.
        h1_match = re.search(r"^#(?!#)\s*(.+)$", clean_content, re.MULTILINE)
        if h1_match:
            title = h1_match.group(1).strip()
            rest_content = clean_content[h1_match.end():].strip()
            fallback_description = rest_content[:200].replace("\n", " ") + "..."
        else:
            title = front_matter.get("title", path.stem)
            fallback_description = clean_content[:200].replace("\n", " ") + "..."
        description = front_matter.get("description", fallback_description)

        tags = extract_tags(clean_content, front_matter.get("tags", []))

        fm_dict = {
            "title": title,
            "description": description,
            "type": ftype,
            "tags": tags,
        }
        yaml_fm = "---\n" + yaml.safe_dump(fm_dict, sort_keys=False, allow_unicode=True) + "---\n\n"

        # Build the JSON-LD from the body *before* the index-link footer is
        # appended, so the footer does not end up inside FAQ answers or the
        # fallback description.
        json_ld = generate_json_ld(clean_content, front_matter, ftype, title, rel_path)
        body = add_index_link(clean_content, target_path)

        with target_path.open("w", encoding="utf-8") as f:
            f.write(yaml_fm)
            f.write(body.rstrip())
            f.write("\n\n")
            f.write(json_ld)

        processed.append(rel_path)

    return processed
|
|
|
def generate_index(files):
    """Write INDEX_FILE: a nested Markdown bullet list of the given files.

    Args:
        files: Iterable of relative ``pathlib.Path`` objects. Directories
            become plain bullets, files become links; entries on each level
            are sorted by name.
    """
    # Nested dict mirroring the directory layout; a file is a None leaf.
    hierarchy = {}
    for rel in files:
        segments = list(rel.parts)
        node = hierarchy
        for folder in segments[:-1]:
            node = node.setdefault(folder, {})
        node[segments[-1]] = None

    def walk(node, base="", depth=0):
        """Yield the bullet lines for *node*, one indent step per level."""
        pad = " " * depth
        for name in sorted(node):
            child = node[name]
            here = Path(base) / name
            if child is None:
                yield f"{pad}- [{name}]({here.as_posix()})"
            else:
                yield f"{pad}- {name}"
                yield from walk(child, here, depth + 1)

    index_lines = ["# ИИ-дружелюбные версии файлов\n", *walk(hierarchy)]

    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(INDEX_FILE, "w", encoding="utf-8") as out:
        out.write("\n".join(index_lines))
|
|
|
if __name__ == "__main__":
    # The output directory must exist before any mirrored file is written.
    STRUCTURED_DIR.mkdir(exist_ok=True)

    # Mirror the Markdown files, then index exactly the files that were mirrored.
    mirrored = mirror_md_files()
    generate_index(mirrored)

    print(f"Обработано {len(mirrored)} файлов. Индекс создан: {INDEX_FILE}")
|
|