Flamehaven committed on Sep 8

Commit

d466b7d

0 Parent(s):

Initial commit: Dir2md open-core project

Browse files

Files changed (22) hide show

.devcontainer/devcontainer.json +10 -0
.github/workflows/dir2md-blueprint.yml +55 -0
.gitignore +49 -0
.pre-commit-config.yaml +9 -0
Dockerfile +6 -0
FEATURES.md +271 -0
LICENSE +21 -0
README.md +175 -0
pyproject.toml +22 -0
scripts/bench_dir2md.py +37 -0
src/dir2md/__init__.py +2 -0
src/dir2md/cli.py +109 -0
src/dir2md/core.py +249 -0
src/dir2md/gitignore.py +41 -0
src/dir2md/license.py +62 -0
src/dir2md/manifest.py +9 -0
src/dir2md/markdown.py +36 -0
src/dir2md/parallel.py +20 -0
src/dir2md/simhash.py +29 -0
src/dir2md/summary.py +32 -0
src/dir2md/token.py +5 -0
tests/test_dir2md.py +109 -0

.devcontainer/devcontainer.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "dir2md",
+  "image": "mcr.microsoft.com/devcontainers/python:3.11",
+  "postCreateCommand": "pip install -e . && pre-commit install",
+  "customizations": {
+    "vscode": {
+      "extensions": ["ms-python.python", "ms-python.vscode-pylance"]
+    }
+  }
+}

.github/workflows/dir2md-blueprint.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+# Builds a dir2md blueprint of the repository on every pull request (or on
+# manual dispatch), uploads it as an artifact and posts the estimated prompt
+# token count as a PR comment.
+name: dir2md Blueprint
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+  workflow_dispatch:
+# The job only reads the repository and posts one PR comment.
+permissions:
+  contents: read
+  pull-requests: write
+jobs:
+  build-blueprint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dir2md
+        run: |
+          python -m pip install --upgrade pip
+          pip install .
+      - name: Generate blueprint
+        id: gen
+        run: |
+          # The default `raw` preset switches manifest emission off, so a preset
+          # that keeps it on is requested explicitly (same preset as the
+          # pre-commit hook). Generation stays best-effort via `|| true`.
+          dir2md . --preset iceberg --capsule --emit-manifest --stats -o PROJECT_BLUEPRINT.md || true
+          # Do not let a missing manifest or a missing key fail the whole job.
+          if [ -f PROJECT_BLUEPRINT.manifest.json ]; then
+            TOKENS=$(jq -r '.stats.est_tokens_prompt // "n/a"' PROJECT_BLUEPRINT.manifest.json)
+          else
+            TOKENS="n/a"
+          fi
+          echo "tokens=$TOKENS" >> "$GITHUB_OUTPUT"
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: dir2md-blueprint
+          path: |
+            PROJECT_BLUEPRINT.md
+            PROJECT_BLUEPRINT.manifest.json
+            PROJECT_BLUEPRINT.capsule.zip
+      - name: Comment PR
+        if: github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const tokens = '${{ steps.gen.outputs.tokens }}';
+            const body = [
+              '## 📦 dir2md Blueprint',
+              `- Estimated prompt tokens: **${tokens}**`,
+              '- Artifacts: _see workflow run → Artifacts_',
+              '',
+              'Run locally:',
+              '```bash',
+              'pip install .',
+              'dir2md .',
+              '```'
+            ].join('\n');
+            // Await the request so an API failure fails the step instead of
+            // surfacing as an unhandled promise rejection.
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body
+            });

.gitignore ADDED Viewed

	@@ -0,0 +1,49 @@

+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.env
+.venv/
+venv/
+.idea/
+.vscode/
+.ipynb_checkpoints/
+.dist/
+.build/
+# setuptools writes its artifacts to build/ and dist/ (no leading dot)
+build/
+dist/
+.coverage
+.pytest_cache/
+# OS
+.DS_Store
+Thumbs.db
+# Output files - exclude test outputs but keep documentation
+*.manifest.json
+*_blueprint.md
+*_summary.md
+*_output.md
+test_output.md
+example_*.md
+pro_*.md
+raw_*.md
+secure_*.md
+# Anchored to the repository root: an unanchored `masking.py` would also
+# ignore src/dir2md/masking.py, which src/dir2md/core.py imports.
+/masking.py
+# Keep important documentation
+!README.md
+!FEATURES.md
+!CURRENT_FEATURES.md
+!CONTRIBUTING.md
+!CHANGELOG.md
+# Virtual environment (large, not needed)
+venv_clean/
+# Additional ignores
+.dir2md_cache/
+tmp/
+temp/
+# Personal files to ignore
+ENGLISH_CONVERSION_COMPLETE.md
+USAGE_EXAMPLES.md

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+repos:
+  - repo: local
+    hooks:
+      - id: dir2md-dry-run
+        name: dir2md (dry-run)
+        entry: bash -lc 'dir2md . --preset iceberg --emit-manifest --stats --dry-run >/dev/null || true'
+        language: system
+        pass_filenames: false
+        stages: [pre-commit]

Dockerfile ADDED Viewed

	@@ -0,0 +1,6 @@

+FROM python:3.11-slim
+WORKDIR /work
+COPY . /work
+RUN pip install --no-cache-dir .
+ENTRYPOINT ["dir2md"]
+CMD ["."]

FEATURES.md ADDED Viewed

	@@ -0,0 +1,271 @@

+# Dir2md Feature Comparison: Open Source vs Pro
+> **Transform your codebase into LLM-optimized markdown blueprints**
+Dir2md follows an **Open-Core** model - providing essential functionality for free while offering advanced features for professional teams and power users.
+## 🎯 Quick Comparison
+| Feature Category | Open Source (Free) | Pro Version |
+|------------------|-------------------|-------------|
+| **Basic Functionality** | ✅ Full Access | ✅ Enhanced |
+| **Security & Masking** | ✅ Basic Patterns | ✅ Advanced + Custom |
+| **Performance** | ✅ Single-threaded | ✅ Parallel + Caching |
+| **Export Options** | ✅ Markdown Only | ✅ HTML, PDF, Slides |
+| **Team Features** | ❌ Individual Use | ✅ CI/CD Integration |
+| **Language Support** | ✅ Basic Analysis | ✅ Smart Plugins |
+---
+## 🔓 Open Source Features (MIT License)
+### Core Functionality
+- **📁 Directory Scanning**: Complete file tree analysis with `.gitignore` support
+- **🎯 Smart Filtering**: Include/exclude/omit glob patterns
+- **📊 Token Optimization**: Head/tail sampling with configurable budgets
+- **🔄 Duplicate Detection**: SimHash-based content deduplication
+- **📋 Manifest Generation**: JSON metadata with file hashes and statistics
+- **⏰ Deterministic Output**: `--no-timestamp` for reproducible builds
+- **🎨 Multiple Presets**: `iceberg`, `pro`, `raw` (default: `raw` for developers)
+### Basic Security
+- **🛡️ Essential Masking**: Protection for common secrets
+  - AWS Access Keys (`AKIA[0-9A-Z]{16}`)
+  - Bearer Tokens (`Bearer <token>`)
+  - Private Keys (`-----BEGIN ... PRIVATE KEY-----`)
+### Output Modes
+- **📝 Reference Mode**: File listings with metadata
+- **📖 Summary Mode**: Condensed content overview
+- **📄 Inline Mode**: Full content inclusion (within token budget)
+### CLI & Integration
+- **⚡ Command Line Interface**: Full-featured CLI with help system
+- **🔧 Configurable Options**: Extensive customization via arguments
+- **📦 Easy Installation**: `pip install dir2md`
+---
+## 🔒 Pro Version Features
+### Advanced Security & Compliance
+- **🛡️ Comprehensive Masking**: 25+ built-in patterns
+  - Cloud Provider Keys (AWS, Azure, GCP)
+  - API Tokens (Slack, GitHub, GitLab)
+  - Database Connections & Credentials
+  - Custom Pattern Support
+- **🔍 Smart Detection**: File-type aware masking
+- **✅ False Positive Reduction**: Context-aware pattern matching
+- **📝 Audit Logging**: Security scanning reports
+### Performance & Scale
+- **⚡ Parallel Processing**: Multi-threaded file analysis
+- **💾 Incremental Caching**: `.dir2md_cache/` for faster re-runs
+- **📈 Large Repository Support**: Optimized for 10,000+ files
+- **🚀 Streaming Processing**: Memory-efficient for massive codebases
+### Advanced Analysis
+- **🧠 Language Plugins**: Smart code analysis
+  - **Python**: AST parsing, function/class extraction
+  - **JavaScript/TypeScript**: ES module analysis, export detection
+  - **Go**: Package structure, type definitions
+  - **Java**: Class hierarchy, annotation extraction
+- **📊 Drift Detection**: Compare blueprint versions
+- **🎯 Impact Scoring**: Identify critical changes
+### Export & Sharing
+- **📄 Multiple Formats**: HTML, PDF, PowerPoint slides
+- **🎨 Custom Templates**: Branded output with Jinja2
+- **📱 Responsive HTML**: Mobile-friendly documentation
+- **🖨️ Print Optimization**: Publication-ready PDFs
+### Team & CI/CD Integration
+- **🤖 GitHub Actions**: Automated blueprint generation
+- **💬 PR Comments**: Automatic documentation updates
+- **🔗 GitLab Integration**: Pipeline integration support
+- **📋 Status Checks**: Quality gates for documentation
+- **👥 Team Templates**: Standardized output formats
+### Developer Experience
+- **🖥️ Terminal UI (TUI)**: Interactive file selection
+- **🔍 Live Preview**: Real-time output preview
+- **⚙️ Advanced Configuration**: Team-wide settings
+- **📊 Analytics Dashboard**: Usage metrics and insights
+---
+## 💰 Pricing & Licensing
+### Open Source (MIT)
+- **Price**: Free forever
+- **Use Case**: Individual developers, small projects
+- **Support**: Community via GitHub Issues
+- **License**: MIT - commercial use allowed
+### Pro Version
+- **Individual**: $29/month or $290/year
+- **Team (5 users)**: $99/month or $990/year
+- **Enterprise**: Custom pricing with on-premise options
+- **Support**: Priority email support + documentation
+- **License**: Commercial license with usage analytics opt-out
+---
+## 🚀 Usage Examples
+### Open Source Quick Start
+```bash
+# Install from PyPI
+pip install dir2md
+# Basic usage with security masking
+dir2md ./my-project --masking basic --preset raw
+# Generate with manifest for CI/CD
+dir2md . --emit-manifest --no-timestamp --output blueprint.md
+```
+### Pro Version Examples
+```bash
+# Set Pro license
+export DIR2MD_LICENSE="PRO-your-license-key"
+# Advanced masking with custom patterns
+dir2md . --masking advanced --preset pro
+# Parallel processing with caching
+dir2md ./large-repo --parallel --use-cache
+# Generate multiple formats
+dir2md . --export html,pdf --template branded
+```
+### GitHub Actions Integration
+**Open Source:**
+```yaml
+- name: Generate Blueprint
+  run: |
+    pip install dir2md
+    dir2md . --no-timestamp --output docs/blueprint.md
+```
+**Pro Version:**
+```yaml
+- name: Generate Pro Blueprint
+  env:
+    DIR2MD_LICENSE: ${{ secrets.DIR2MD_PRO_LICENSE }}
+  run: |
+    pip install dir2md-pro
+    dir2md . --masking advanced --export html --pr-comment
+```
+---
+## 🎯 When to Upgrade to Pro
+### Individual Developers
+- Working with sensitive codebases requiring advanced security
+- Need faster processing for large repositories (1000+ files)
+- Want professional-looking exports for client presentations
+- Require language-specific code analysis
+### Teams & Organizations
+- Standardizing documentation across multiple projects
+- Integrating with CI/CD pipelines for automatic updates
+- Need compliance features for security auditing
+- Want team analytics and usage insights
+### Enterprise Users
+- On-premise deployment requirements
+- SSO/SAML integration needs
+- Custom security patterns and compliance rules
+- Dedicated support and SLA requirements
+---
+## 🛠️ Technical Implementation
+### Open-Core Architecture
+```
+dir2md-core (OSS)           dir2md-pro (Commercial)
+├── CLI Interface           ├── Advanced Masking
+├── File Scanning           ├── Language Plugins
+├── Token Optimization      ├── Parallel Engine
+├── Basic Masking           ├── Export Templates
+├── Manifest Generation     ├── Team Integration
+└── Markdown Output         └── License Validation
+```
+### License Validation
+- **Runtime Check**: Environment variable `DIR2MD_LICENSE`
+- **Offline Validation**: Ed25519 signature verification
+- **Graceful Degradation**: Falls back to OSS features if invalid
+- **No Phone Home**: All validation happens locally
+### Plugin System
+```python
+# Pro Plugin Example
+class PythonAnalyzer(LanguagePlugin):
+    extensions = {'.py'}
+    def analyze(self, content: str) -> Dict[str, Any]:
+        return {
+            'functions': self.extract_functions(content),
+            'classes': self.extract_classes(content),
+            'imports': self.extract_imports(content)
+        }
+```
+---
+## 🆚 Comparison with Alternatives
+| Tool | Open Source | Pro Features | License Model |
+|------|-------------|--------------|---------------|
+| **dir2md** | ✅ Full core functionality | ✅ Advanced security, performance, team features | Open-Core (MIT + Commercial) |
+| tree + cat | ✅ Basic listing | ❌ No advanced features | Free (but manual) |
+| Proprietary doc tools | ❌ Closed source | ✅ Enterprise features | Subscription only |
+| Custom scripts | ✅ DIY solution | ❌ No standardization | Time investment |
+---
+## 📞 Get Started
+### Try Open Source
+```bash
+pip install dir2md
+dir2md --help
+```
+### Evaluate Pro Features
+```bash
+# 14-day free trial
+export DIR2MD_LICENSE="TRIAL-request-at-dir2md.com"
+pip install dir2md-pro
+dir2md --masking advanced --parallel
+```
+### Purchase Pro License
+- **Individual**: [Buy now for $29/month](https://dir2md.com/buy/individual)
+- **Team**: [Start team trial](https://dir2md.com/buy/team)
+- **Enterprise**: [Contact sales](https://dir2md.com/contact)
+---
+## 🤝 Contributing
+Dir2md's open-source core welcomes contributions:
+- **Bug Reports**: [GitHub Issues](https://github.com/your-org/dir2md/issues)
+- **Feature Requests**: [GitHub Discussions](https://github.com/your-org/dir2md/discussions)
+- **Code Contributions**: See [CONTRIBUTING.md](CONTRIBUTING.md)
+- **Documentation**: Help improve our guides and examples
+Pro features are developed in-house but benefit from community feedback and OSS improvements.
+---
+*Made with ❤️ for developers who value great documentation*

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Yoon
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,175 @@

+# Dir2md
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
+> Transform your codebase into LLM-optimized markdown blueprints
+Dir2md analyzes directory structures and generates comprehensive markdown documentation optimized for Large Language Models. It intelligently samples content, removes duplicates, and provides token-budget control to create the perfect context for AI-assisted development.
+## ✨ Key Features
+- **🎯 Smart Content Sampling**: Head/tail sampling with configurable token budgets
+- **🔄 Duplicate Detection**: SimHash-based deduplication to reduce noise
+- **🛡️ Security First**: Built-in secret masking (basic OSS, advanced Pro)
+- **📊 Multiple Output Modes**: Reference, summary, or full inline content
+- **🔧 Highly Configurable**: Extensive filtering and customization options
+- **⚡ Developer Friendly**: Raw mode default for complete code visibility
+## 🚀 Quick Start
+### Installation
+```bash
+# From source (current)
+git clone https://github.com/your-org/dir2md.git
+cd dir2md
+python -m src.dir2md.cli --help
+# Coming soon: PyPI installation
+pip install dir2md
+```
+### Basic Usage
+```bash
+# Generate project blueprint (developer-friendly raw mode)
+dir2md .
+# With basic security masking
+dir2md . --masking basic
+# Generate with manifest for CI/CD
+dir2md . --emit-manifest --no-timestamp
+# Token-optimized for LLM context
+dir2md . --budget-tokens 4000 --preset iceberg
+```
+### Output Example
+```markdown
+# Project Blueprint
+- Root: `/path/to/project`
+- Generated: `2025-09-08 12:30:15`
+- Preset: `raw`
+- LLM mode: `inline`
+- Estimated tokens (prompt): `6247`
+## Directory Tree
+[Complete file structure]
+## Statistics
+| Metric | Value |
+|--------|-------|
+| Total files | 42 |
+| Estimated tokens | 6247 |
+## File Contents
+[Intelligently sampled content...]
+```
+## 📋 Available Presets
+| Preset | Description | Best For |
+|--------|-------------|-----------|
+| `raw` | Full content inclusion | Development, code review |
+| `iceberg` | Balanced sampling | General documentation |
+| `pro` | Advanced optimization | Large projects, LLM context |
+## 🔒 Open-Core Model
+### Free (OSS) Features
+- Complete directory analysis
+- Token optimization and sampling
+- SimHash deduplication
+- Basic security masking (3 patterns)
+- All output modes and presets
+- Deterministic builds
+### Pro Features
+- Advanced security masking (9+ patterns)
+- Parallel processing & caching
+- Language-specific analysis plugins
+- HTML/PDF export options
+- Team integration (CI/CD, PR bots)
+- Priority support
+[Learn more about Pro features](FEATURES.md)
+## 📖 Documentation
+- **[Feature Comparison](FEATURES.md)** - Complete OSS vs Pro breakdown
+- **[Current Status](CURRENT_FEATURES.md)** - What's implemented now
+- **[Usage Examples](USAGE_EXAMPLES.md)** - Hands-on guide with examples
+## 🛠️ CLI Reference
+```bash
+# Basic options
+dir2md [path] -o output.md --preset [iceberg|pro|raw]
+# Token control
+--budget-tokens 6000          # Total token budget
+--max-file-tokens 1200        # Per-file token limit
+--sample-head 120             # Lines from file start
+--sample-tail 40              # Lines from file end
+# Filtering
+--include-glob "*.py" --include-glob "*.md"    # Include patterns (repeat the flag per pattern)
+--exclude-glob "test*" --exclude-glob "*.tmp"  # Exclude patterns (repeat the flag per pattern)
+--only-ext "py,js,ts"         # File extensions only
+# Security
+--masking [off|basic|advanced] # Secret masking level
+# Output
+--emit-manifest              # Generate JSON metadata
+--no-timestamp              # Reproducible output
+--dry-run                   # Preview without writing
+```
+## 🤝 Contributing
+We welcome contributions! Dir2md follows an open-core model:
+- **Core functionality**: Open source (this repo)
+- **Advanced features**: Commercial (separate repo)
+- **Community**: All discussions welcome
+### Development Setup
+```bash
+git clone https://github.com/your-org/dir2md.git
+cd dir2md
+python -m pytest -v  # Run tests
+python -m src.dir2md.cli . --dry-run  # Test CLI
+```
+### Reporting Issues
+- 🐛 **Bug reports**: [GitHub Issues](https://github.com/your-org/dir2md/issues)
+- 💡 **Feature requests**: [GitHub Discussions](https://github.com/your-org/dir2md/discussions)
+- 📧 **Security issues**: [email protected]
+## 📄 License
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+Pro features are available under a separate commercial license.
+## 🌟 Why Dir2md?
+Traditional documentation approaches fall short when working with AI assistants:
+- **Too much noise**: Raw `tree` + `cat` includes irrelevant files
+- **Token waste**: Unoptimized content hits LLM context limits
+- **Security risks**: Accidental exposure of secrets and keys
+- **No structure**: Difficult for AI to understand project layout
+Dir2md solves these problems with intelligent analysis, sampling, and optimization specifically designed for the AI era.
+---
+*Made with ❤️ for developers who want their AI to understand their code*

pyproject.toml ADDED Viewed

	@@ -0,0 +1,22 @@

+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "dir2md"
+version = "0.0.1"
+description = "Generate a Markdown blueprint: directory tree + optional file contents (token-optimized, ICEBERG preset)"
+readme = "README.md"
+authors = [{name = "Flamehaven", email = "[email protected]"}]
+license = {text = "MIT"}
+requires-python = ">=3.9"
+dependencies = ["pathspec>=0.12.0"]
+[project.scripts]
+dir2md = "dir2md.cli:main"
+[tool.setuptools]
+package-dir = {"" = "src"}
+[tool.setuptools.packages.find]
+where = ["src"]

scripts/bench_dir2md.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from __future__ import annotations
+import time, json, argparse
+from pathlib import Path
+from dir2md.core import Config, generate_markdown_report
+def run_case(root: Path, preset: str, mode: str | None, budget: int, file_budget: int) -> dict:
+    cfg = Config(
+        root=root, output=root/"_BENCH.md", include_globs=[], exclude_globs=[], omit_globs=[],
+        respect_gitignore=True, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
+        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
+        llm_mode=(mode or "ref"), budget_tokens=budget, max_file_tokens=file_budget,
+        dedup_bits=16, sample_head=120, sample_tail=40, strip_comments=False,
+        emit_manifest=False, preset=preset, explain_capsule=True,
+    )
+    t0 = time.perf_counter()
+    md = generate_markdown_report(cfg)
+    dt = time.perf_counter() - t0
+    est = md.split("Estimated tokens (prompt): `")[-1].split("`")[0]
+    return {"preset": preset, "mode": cfg.llm_mode, "elapsed_sec": round(dt,3), "est_tokens": int(est)}
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("path", nargs="?", default=".")
+    ns = ap.parse_args()
+    root = Path(ns.path).resolve()
+    cases = [
+        ("iceberg", None, 6000, 1000),
+        ("pro", "summary", 6000, 1000),
+        ("pro", "ref", 4000, 1000),
+        ("pro", "inline", 8000, 1200),
+    ]
+    rows = [run_case(root, *c) for c in cases]
+    print(json.dumps(rows, indent=2))
+if __name__ == "__main__":
+    main()

src/dir2md/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __all__ = ["__version__"]
2	+ __version__ = "0.0.1"

src/dir2md/cli.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from __future__ import annotations
+import argparse, zipfile, hashlib
+from pathlib import Path
+from .core import Config, generate_markdown_report
+from . import __version__
+# Exclude patterns used when the user passes no --exclude-glob at all.
+# Any user-supplied --exclude-glob replaces this list rather than extending it.
+DEFAULT_EXCLUDES = [
+    ".git", "__pycache__", "node_modules", ".venv",
+    "build", "dist", "*.pyc", ".DS_Store",
+]
+def positive_int(v: str) -> int:
+    try:
+        iv = int(v)
+    except ValueError:
+        raise argparse.ArgumentTypeError("Please enter an integer value.")
+    if iv <= 0:
+        raise argparse.ArgumentTypeError("Only positive integers are allowed.")
+    return iv
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(prog="dir2md", description="Directory → Markdown exporter with LLM optimization")
+    ap.add_argument("path", nargs="?", default=".")
+    ap.add_argument("-o", "--output", default="PROJECT_BLUEPRINT.md")
+    # Preset options
+    ap.add_argument("--preset", default="raw", choices=["iceberg","pro","raw"], help="Preset mode: iceberg/pro/raw")
+    # Token and selection control
+    ap.add_argument("--llm-mode", choices=["off","ref","summary","inline"], default=None)
+    ap.add_argument("--budget-tokens", type=int, default=6000)
+    ap.add_argument("--max-file-tokens", type=int, default=1200)
+    ap.add_argument("--dedup", type=int, default=16)
+    ap.add_argument("--sample-head", type=int, default=120)
+    ap.add_argument("--sample-tail", type=int, default=40)
+    ap.add_argument("--explain", action="store_true", help="Include selection rationale and drift_score in capsule comments")
+    # Filtering and safety controls
+    ap.add_argument("--include-glob", action="append", default=[])
+    ap.add_argument("--exclude-glob", action="append", default=[])
+    ap.add_argument("--omit-glob", action="append", default=[])
+    ap.add_argument("--only-ext", default="")
+    ap.add_argument("--respect-gitignore", action="store_true")
+    ap.add_argument("--follow-symlinks", action="store_true")
+    ap.add_argument("--max-bytes", type=positive_int, default=200_000)
+    ap.add_argument("--max-lines", type=positive_int, default=2000)
+    # Output options
+    ap.add_argument("--emit-manifest", action="store_true")
+    ap.add_argument("--stats", action="store_true")
+    ap.add_argument("--capsule", action="store_true", help="Package md+manifest into zip")
+    ap.add_argument("--dry-run", action="store_true")
+    ap.add_argument("--no-timestamp", action="store_true", help="Omit timestamp for reproducible output")
+    ap.add_argument("--masking", choices=["off", "basic", "advanced"], default="off", help="Secret masking mode (advanced requires Pro license)")
+    ap.add_argument("-V", "--version", action="version", version=f"dir2md {__version__}")
+    ns = ap.parse_args(argv)
+    root = Path(ns.path).resolve()
+    output = Path(ns.output)
+    only_ext = {e.strip().lstrip('.') for e in ns.only_ext.split(',') if e.strip()} or None
+    cfg = Config(
+        root=root,
+        output=output,
+        include_globs=list(ns.include_glob),
+        exclude_globs=list(ns.exclude_glob or DEFAULT_EXCLUDES),
+        omit_globs=list(ns.omit_glob),
+        respect_gitignore=bool(ns.respect_gitignore),
+        follow_symlinks=bool(ns.follow_symlinks),
+        max_bytes=int(ns.max_bytes) if ns.max_bytes else None,
+        max_lines=int(ns.max_lines) if ns.max_lines else None,
+        include_contents=True,
+        only_ext=only_ext,
+        add_stats=bool(ns.stats or True),
+        add_toc=False,
+        llm_mode=(ns.llm_mode or "ref"),
+        budget_tokens=int(ns.budget_tokens),
+        max_file_tokens=int(ns.max_file_tokens),
+        dedup_bits=int(ns.dedup),
+        sample_head=int(ns.sample_head),
+        sample_tail=int(ns.sample_tail),
+        strip_comments=False,
+        emit_manifest=bool(ns.emit_manifest),
+        preset=str(ns.preset),
+        explain_capsule=bool(ns.explain),
+        no_timestamp=bool(ns.no_timestamp),
+        masking_mode=str(ns.masking),
+    )
+    md = generate_markdown_report(cfg)
+    if ns.dry_run:
+        h = hashlib.sha256(md.encode('utf-8')).hexdigest()[:10]
+        print(f"[dry-run] preset={cfg.preset} mode={cfg.llm_mode} est_tokens~{cfg.budget_tokens} md={h}")
+        return 0
+    output.write_text(md, encoding="utf-8")
+    if ns.capsule:
+        with zipfile.ZipFile(output.with_suffix('.capsule.zip'), 'w') as z:
+            z.write(output)
+            if cfg.emit_manifest and output.with_suffix('.manifest.json').exists():
+                z.write(output.with_suffix('.manifest.json'))
+    print(f"[dir2md] Wrote: {output}")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

src/dir2md/core.py ADDED Viewed

	@@ -0,0 +1,249 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+import json
+from .gitignore import build_gitignore_matcher
+from .markdown import to_markdown
+from .simhash import simhash64, hamming
+from .summary import summarize
+from .manifest import sha256_bytes, write_manifest
+from .token import estimate_tokens
+from .masking import apply_masking
+@dataclass
+class Stats:
+    """Counters collected while a report is generated; every field starts at 0."""
+    # Incremented once per directory entered by the tree walk (root included).
+    total_dirs: int = 0
+    total_files_in_tree: int = 0
+    total_omitted: int = 0
+    total_with_contents: int = 0
+    # NOTE(review): presumably the value shown as "Estimated tokens (prompt)" — confirm.
+    est_tokens_prompt: int = 0
+@dataclass
+class Config:
+    """Settings for one report run.
+
+    ``apply_preset`` mutates an instance in place, so values passed here may be
+    overridden by the chosen ``preset`` before the report is generated.
+    """
+    # Directory to scan; it must exist and be a directory.
+    root: Path
+    output: Path
+    include_globs: List[str]
+    # Entries matching any of these patterns are left out of the tree walk.
+    exclude_globs: List[str]
+    # Files matching these stay in the tree but get no content candidate.
+    omit_globs: List[str]
+    # When true, a matcher built from the root's .gitignore also filters entries.
+    respect_gitignore: bool
+    follow_symlinks: bool
+    # Files larger than this are truncated to this many bytes before decoding.
+    max_bytes: Optional[int]
+    # Inline mode keeps at most this many lines per file.
+    max_lines: Optional[int]
+    include_contents: bool
+    # Lower-case extensions without the dot; None disables the extension filter.
+    only_ext: Optional[set[str]] = None
+    add_stats: bool = True
+    add_toc: bool = False
+    # Preset/token related
+    llm_mode: str = "ref"   # off|ref|summary|inline
+    # Total token budget; blocks that would exceed it are skipped.
+    budget_tokens: int = 6000
+    # Inline files estimated above this are reduced to head/tail samples.
+    max_file_tokens: int = 1200
+    # Max SimHash Hamming distance treated as a duplicate; 0 turns dedup off.
+    dedup_bits: int = 16
+    # Number of leading / trailing lines kept when an inline file is sampled.
+    sample_head: int = 120
+    sample_tail: int = 40
+    strip_comments: bool = False
+    emit_manifest: bool = True
+    # iceberg|pro|raw — see apply_preset for what each one overrides.
+    preset: str = "iceberg"
+    # Appends "<!-- why: ... -->" comments to summary and inline blocks.
+    explain_capsule: bool = False
+    no_timestamp: bool = False
+    # Any value other than "off" runs apply_masking on file text.
+    masking_mode: str = "basic"
+_DEFAULT_ONLY_EXT = {"py","ts","tsx","js","jsx","md","txt","toml","yaml","yml","json", ""}
+def apply_preset(cfg: Config) -> Config:
+    try:
+        total_bytes = sum((f.stat().st_size for f in cfg.root.rglob('*') if f.is_file()))
+    except Exception:
+        total_bytes = 0
+    if cfg.preset == "iceberg":
+        cfg.respect_gitignore = True
+        if not cfg.only_ext:
+            cfg.only_ext = set(_DEFAULT_ONLY_EXT)
+        cfg.dedup_bits = 16
+        cfg.emit_manifest = True
+        # Auto-determine mode based on repository size
+        if total_bytes < 200_000:
+            cfg.llm_mode = "inline"; cfg.budget_tokens = min(cfg.budget_tokens, 6000); cfg.max_file_tokens = 1000
+        elif total_bytes < 5_000_000:
+            cfg.llm_mode = "summary"; cfg.budget_tokens = min(cfg.budget_tokens, 6000)
+        else:
+            cfg.llm_mode = "ref"; cfg.budget_tokens = min(cfg.budget_tokens, 4000)
+    elif cfg.preset == "raw":
+        cfg.llm_mode = "inline"; cfg.dedup_bits = 0; cfg.only_ext = None; cfg.emit_manifest = False
+    # pro: maintain user settings
+    return cfg
+def generate_markdown_report(cfg: Config) -> str:
+    cfg = apply_preset(cfg)
+    root = cfg.root
+    if not root.exists():
+        raise FileNotFoundError(f"Path does not exist: {root}")
+    if not root.is_dir():
+        raise NotADirectoryError(f"Path is not a directory: {root}")
+    gitignore = build_gitignore_matcher(root) if cfg.respect_gitignore else None
+    def is_ignored(p: Path) -> bool:
+        if gitignore and gitignore(str(p.relative_to(root) if p != root else "")):
+            return True
+        for pat in cfg.exclude_globs:
+            if p.match(pat) or any(part == pat for part in p.parts):
+                return True
+        return False
+    def is_omitted(p: Path) -> bool:
+        for pat in cfg.omit_globs:
+            if p.match(pat) or any(part == pat for part in p.parts):
+                return True
+        return False
+    # Tree & file collection
+    tree_lines: list[str] = [str(root)]
+    files: list[Path] = []
+    stats = Stats()  # Pre-create for accurate directory counting
+    def walk(current: Path, prefix: str = "") -> None:
+        # Count when entering directory
+        stats.total_dirs += 1
+        try:
+            entries = sorted(list(current.iterdir()), key=lambda x: (not x.is_dir(), x.name.lower()))
+        except PermissionError:
+            return
+        entries = [e for e in entries if not is_ignored(e)]
+        for i, child in enumerate(entries):
+            last = (i == len(entries)-1)
+            joint = "└── " if last else "├── "
+            tree_lines.append(f"{prefix}{joint}{child.name}")
+            if child.is_dir():
+                walk(child, prefix + ("    " if last else "│   "))
+            else:
+                files.append(child)
+    walk(root)
+    # Generate candidates + deduplication
+    candidates: list[dict] = []
+    sim_seen: list[int] = []
+    for f in files:
+        if cfg.only_ext and f.suffix.lstrip(".").lower() not in cfg.only_ext:
+            continue
+        if is_omitted(f):
+            continue
+        try:
+            raw = f.read_bytes()
+        except Exception:
+            continue
+        if cfg.max_bytes and len(raw) > cfg.max_bytes:
+            raw = raw[: cfg.max_bytes]
+        text = raw.decode("utf-8", errors="replace")
+        if cfg.masking_mode != "off":
+            text = apply_masking(text, mode=cfg.masking_mode)
+        sh = simhash64(text)
+        # Deduplication
+        if cfg.dedup_bits > 0 and any(hamming(sh, h0) <= cfg.dedup_bits for h0 in sim_seen):
+            continue
+        sim_seen.append(sh)
+        candidates.append({
+            "path": f,
+            "sha256": sha256_bytes(raw),
+            "summary": summarize(f, text, max_lines=40),
+            "text": text,
+            "simhash": sh,
+        })
+    # Apply budget + reflect mode (Explain & Drift)
+    est_total = 0
+    selected_blocks: list[tuple[Path, str, str]] = []
+    selected_hashes: list[int] = []
+    def drift_score_bits(sh: int) -> int:
+        """Return the smallest Hamming distance between ``sh`` and the hashes selected so far.
+        When nothing has been selected yet the result is 64.
+        """
+        # The default of min() covers the "nothing selected yet" case.
+        distances = (hamming(sh, prev) for prev in selected_hashes)
+        return min(distances, default=64)
+    for rec in candidates:
+        if cfg.llm_mode == "off":
+            break
+        sh = rec["simhash"]
+        drift_bits = drift_score_bits(sh)
+        drift = round(drift_bits / 64, 3)  # 0~1, higher = fresher
+        if cfg.llm_mode == "ref":
+            meta = json.dumps({"sha256": rec["sha256"], "path": str(rec["path"]), "drift": drift}, ensure_ascii=False)
+            tok = estimate_tokens(meta) + 16
+            if est_total + tok > cfg.budget_tokens:
+                continue
+            est_total += tok
+            selected_blocks.append((rec["path"], "json", meta))
+            selected_hashes.append(sh)
+        elif cfg.llm_mode == "summary":
+            payload = rec["summary"]
+            tok = estimate_tokens(payload)
+            if est_total + tok > cfg.budget_tokens:
+                continue
+            est_total += tok
+            text = payload
+            if cfg.explain_capsule:
+                text += f"\n\n<!-- why: summary; drift={drift} -->"
+            selected_blocks.append((rec["path"], "markdown", text))
+            selected_hashes.append(sh)
+        else:  # inline
+            lines = rec["text"].splitlines()
+            if cfg.max_lines and len(lines) > cfg.max_lines:
+                lines = lines[: cfg.max_lines]
+            content = "\n".join(lines)
+            if estimate_tokens(content) > cfg.max_file_tokens:
+                head = lines[: cfg.sample_head]
+                tail = lines[-cfg.sample_tail:] if cfg.sample_tail > 0 else []
+                mid = f"\n<!-- [truncated middle: {max(0, len(lines)-len(head)-len(tail))} lines omitted] -->\n"
+                content = "\n".join(head + [mid] + tail)
+            tok = min(cfg.max_file_tokens, estimate_tokens(content))
+            if est_total + tok > cfg.budget_tokens:
+                continue
+            est_total += tok
+            if cfg.explain_capsule:
+                content += f"\n\n<!-- why: inline; drift={drift}; tok={tok} -->"
+            lang = rec["path"].suffix.lstrip(".") or "text"
+            selected_blocks.append((rec["path"], lang, content))
+            selected_hashes.append(sh)
+    # Final reflection of accumulated statistics
+    stats.total_files_in_tree = len(files)
+    stats.total_omitted = max(0, len(files) - len(selected_blocks))
+    stats.total_with_contents = len(selected_blocks)
+    stats.est_tokens_prompt = est_total
+    # Note: stats.total_dirs accumulated during walk()
+    # Manifest
+    if cfg.emit_manifest:
+        file_manifest = []
+        for (p, lang, t) in selected_blocks:
+            entry = {"path": str(p.relative_to(root)), "mode": cfg.llm_mode}
+            try:
+                # Re-read file for sha256 to ensure it's always present
+                entry["sha256"] = sha256_bytes(p.read_bytes())
+            except Exception:
+                entry["sha256"] = None
+            if lang == "json":
+                try:
+                    meta = json.loads(t)
+                    entry.update(meta) # drift, etc.
+                except Exception:
+                    pass
+            file_manifest.append(entry)
+        full_manifest = {
+            "stats": {
+                "total_dirs": stats.total_dirs,
+                "total_files_in_tree": stats.total_files_in_tree,
+                "total_omitted": stats.total_omitted,
+                "total_with_contents": stats.total_with_contents,
+                "est_tokens_prompt": stats.est_tokens_prompt,
+            },
+            "files": file_manifest
+        }
+        write_manifest(full_manifest, cfg.output.with_suffix('.manifest.json'))
+    return to_markdown(cfg, tree_lines, selected_blocks, stats)

src/dir2md/gitignore.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from __future__ import annotations
+from pathlib import Path
+from typing import List, Optional, Callable
+try:
+    from pathspec import PathSpec
+except Exception:
+    PathSpec = None  # type: ignore
+def _collect_gitignore_lines(root: Path) -> List[str]:
+    """Collect the patterns of every ``.gitignore`` below ``root``, rewritten relative to ``root``.
+    Blank lines and ``#`` comments are skipped. Patterns from the ``.gitignore``
+    directly in ``root`` are returned verbatim. Patterns from a nested
+    ``sub/.gitignore`` are scoped to that directory:
+    - a pattern with a slash at its start or in its middle is anchored there
+      (``/dist`` -> ``sub/dist``);
+    - a pattern without such a slash may match at any depth below it
+      (``*.tmp`` -> ``sub/**/*.tmp``);
+    - a leading ``!`` stays in front of the rewritten pattern
+      (``!keep`` -> ``!sub/**/keep``).
+    Files are processed shallowest first (ties broken by path), so the result
+    is deterministic and patterns of deeper directories come after those of
+    their parents. Unreadable ``.gitignore`` entries are skipped.
+    """
+    lines: List[str] = []
+    # gitignore matching is last-match-wins, hence parents before children.
+    ignore_files = sorted(root.rglob('.gitignore'), key=lambda p: (len(p.parts), p.as_posix()))
+    for gi in ignore_files:
+        rel_dir = gi.parent.relative_to(root)
+        # Path('.') stringifies to '.', which must never be used as a prefix.
+        base = '' if rel_dir == Path('.') else rel_dir.as_posix()
+        try:
+            raw = gi.read_text(encoding='utf-8', errors='ignore').splitlines()
+        except OSError:
+            # Unreadable file, or a directory that happens to be named .gitignore.
+            continue
+        for ln in raw:
+            s = ln.strip()
+            if not s or s.startswith('#'):
+                continue
+            if not base:
+                # Root-level patterns are already expressed relative to root.
+                lines.append(s)
+                continue
+            negated = s.startswith('!')
+            body = s[1:] if negated else s
+            # A slash anywhere except at the very end anchors the pattern to its directory.
+            anchored = '/' in body.rstrip('/')
+            body = body.lstrip('/')
+            if not body:
+                continue
+            scoped = f"{base}/{body}" if anchored else f"{base}/**/{body}"
+            lines.append(f"!{scoped}" if negated else scoped)
+    return lines
+def build_gitignore_matcher(root: Path) -> Optional[Callable[[str], bool]]:
+    """Build a predicate that tells whether a path matches the collected ignore patterns.
+    Returns ``None`` when ``pathspec`` could not be imported or when no pattern
+    lines were found. Otherwise returns a function taking a path string
+    (presumably relative to ``root`` -- confirm against the caller) and
+    returning the result of ``PathSpec.match_file``.
+    """
+    if PathSpec is None:
+        return None
+    collected = _collect_gitignore_lines(root)
+    root_ignore = root / ".gitignore"
+    head: List[str] = []
+    if root_ignore.exists():
+        # The raw lines of the top-level file go in front of the collected ones.
+        head = root_ignore.read_text(encoding='utf-8', errors='ignore').splitlines()
+    patterns = head + collected
+    if not patterns:
+        return None
+    spec = PathSpec.from_lines("gitwildmatch", patterns)
+    def match(relpath: str) -> bool:
+        return spec.match_file(relpath)
+    return match

src/dir2md/license.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""License and feature gating for dir2md open-core model"""
+import os
+from typing import Set
+class LicenseManager:
+    """Decides which dir2md features are unlocked.
+    The license key is read from the ``DIR2MD_LICENSE`` environment variable
+    once, when the instance is constructed.
+    """
+    # Features available without a license.
+    _BASE_FEATURES = frozenset({
+        'basic_masking',
+        'directory_scan',
+        'gitignore_filter',
+        'token_estimation',
+        'simhash_dedup',
+        'manifest_json',
+        'deterministic_output',
+        'basic_stats',
+    })
+    # Features added on top of the base set by a Pro license.
+    _PRO_FEATURES = frozenset({
+        'advanced_masking',
+        'language_plugins',
+        'parallel_processing',
+        'incremental_cache',
+        'drift_comparison',
+        'html_pdf_export',
+        'pr_integration',
+        'tui_interface',
+    })
+    def __init__(self):
+        self.license_key = os.environ.get('DIR2MD_LICENSE', '')
+        self.is_pro = self._validate_license()
+    def _validate_license(self) -> bool:
+        """Return True when the key starts with ``PRO-`` and is longer than 10 characters (simplified for demo)."""
+        # In production, this would validate ed25519 signature
+        key = self.license_key
+        if not key.startswith('PRO-'):
+            return False
+        return len(key) > 10
+    def get_available_features(self) -> Set[str]:
+        """Return a fresh set of the feature names unlocked by the current license."""
+        features = set(self._BASE_FEATURES)
+        if self.is_pro:
+            features |= self._PRO_FEATURES
+        return features
+    def check_feature(self, feature: str) -> bool:
+        """Tell whether ``feature`` is among the available features."""
+        return feature in self.get_available_features()
+    def require_pro(self, feature: str) -> None:
+        """Raise ``LicenseError`` unless ``feature`` is available; return ``None`` otherwise."""
+        if self.check_feature(feature):
+            return
+        raise LicenseError(
+            f"Feature '{feature}' requires dir2md Pro license. "
+            f"Visit https://dir2md.com/pro for more information."
+        )
+class LicenseError(Exception):
+    """Signals that a feature was requested that the current license does not unlock."""
+# Global license manager instance, shared by every importer of this module.
+# It is constructed when the module is imported, so DIR2MD_LICENSE is read
+# from the environment at that moment.
+license_manager = LicenseManager()

src/dir2md/manifest.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from __future__ import annotations
+from pathlib import Path
+import json, hashlib
+def sha256_bytes(b: bytes) -> str:
+    """Return the SHA-256 digest of ``b`` as a hexadecimal string."""
+    hasher = hashlib.sha256()
+    hasher.update(b)
+    return hasher.hexdigest()
+def write_manifest(data: dict, out: Path) -> None:
+    """Serialize ``data`` as JSON (2-space indent, non-ASCII kept as is) and write it to ``out`` as UTF-8."""
+    # Serialize first so that a serialization error leaves the target untouched.
+    payload = json.dumps(data, ensure_ascii=False, indent=2)
+    with out.open("w", encoding="utf-8") as fh:
+        fh.write(payload)

src/dir2md/markdown.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from __future__ import annotations
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from .core import Config, Stats
+def _fence_for(text: str) -> str:
+    """Return a backtick fence (at least 3 long) that is longer than any backtick run in ``text``."""
+    longest = run = 0
+    for ch in text:
+        run = run + 1 if ch == "`" else 0
+        longest = max(longest, run)
+    return "`" * max(3, longest + 1)
+def to_markdown(cfg: 'Config', tree_lines: list[str], file_blocks: list[tuple[Path, str, str]], stats: 'Stats') -> str:
+    """Assemble the final Markdown report.
+    Args:
+        cfg: run configuration; ``root``, ``no_timestamp``, ``preset``,
+            ``llm_mode`` and ``add_stats`` are read here.
+        tree_lines: pre-rendered lines of the directory tree.
+        file_blocks: ``(path, language tag, text)`` triples, one per emitted
+            file; each ``path`` must lie under ``cfg.root``.
+        stats: counters shown in the header and in the optional summary table.
+    Returns:
+        The report as a single string, sections joined by newlines.
+    Every fenced block uses the same fence to open and to close, sized so that
+    backtick runs inside the content (for example code fences of an embedded
+    README) cannot terminate the block early.
+    """
+    parts: list[str] = []
+    parts.append("# Project Blueprint\n")
+    # Two trailing spaces force a Markdown line break between header items.
+    parts.append(f"- Root: `{cfg.root}`  ")
+    if not cfg.no_timestamp:
+        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        parts.append(f"- Generated: `{ts}`  ")
+    parts.append(f"- Preset: `{cfg.preset}`  ")
+    parts.append(f"- LLM mode: `{cfg.llm_mode}`  ")
+    parts.append(f"- Estimated tokens (prompt): `{stats.est_tokens_prompt}`  ")
+    parts.append("")
+    parts.append("## Directory Tree\n")
+    tree_text = "\n".join(tree_lines)
+    tree_fence = _fence_for(tree_text)
+    parts.append(f"{tree_fence}\n{tree_text}\n{tree_fence}\n\n")
+    if cfg.llm_mode != "off" and file_blocks:
+        parts.append("## File Contents\n")
+        for path, lang, text in file_blocks:
+            rel = path.relative_to(cfg.root)
+            fence = _fence_for(text)
+            parts.append(f"### File: `{rel}`\n")
+            parts.append(f"{fence}{lang}\n{text}\n\n{fence}\n")
+    if cfg.add_stats:
+        parts.append("## Summary\n")
+        parts.append("| metric | value |\n|---|---:|")
+        parts.append(f"| dirs | {stats.total_dirs} |")
+        parts.append(f"| files in tree | {stats.total_files_in_tree} |")
+        parts.append(f"| selected files | {stats.total_with_contents} |")
+        parts.append(f"| omitted | {stats.total_omitted} |")
+        parts.append(f"| est tokens (prompt) | {stats.est_tokens_prompt} |\n")
+    return "\n".join(parts)

src/dir2md/parallel.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""Parallel processing module (Pro feature)"""
+from .license import license_manager, LicenseError
+def parallel_file_processing(files, processor_func):
+    """Apply ``processor_func`` to every item of ``files`` on a pool of 4 threads (Pro feature).
+    The 'parallel_processing' feature is required first, so ``LicenseError``
+    is raised when it is not available. The results are returned as a list.
+    """
+    license_manager.require_pro('parallel_processing')
+    # Imported only after the license check has passed.
+    from concurrent.futures import ThreadPoolExecutor
+    pool = ThreadPoolExecutor(max_workers=4)
+    with pool:
+        outcomes = pool.map(processor_func, files)
+        # Drain the iterator while the pool is still open.
+        return list(outcomes)
+def check_cache(file_path):
+    """Report whether ``file_path`` is cached (Pro feature).
+    Placeholder: once the 'incremental_cache' license check has passed it
+    always answers ``False``; ``file_path`` is not inspected.
+    """
+    license_manager.require_pro('incremental_cache')
+    # Cache checking logic would go here; until then nothing is ever a hit.
+    cache_hit = False
+    return cache_hit

src/dir2md/simhash.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from __future__ import annotations
+from typing import Iterable
+import re, hashlib
+# A token is a maximal run of ASCII letters, digits or underscores.
+_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
+def _tokens(s: str) -> list[str]:
+    """Lower-case ``s`` and return its tokens in order of appearance."""
+    lowered = s.lower()
+    return [hit.group() for hit in _TOKEN_RE.finditer(lowered)]
+def _shingles(seq: list[str], k: int = 4) -> Iterable[int]:
+    """Yield a 64-bit hash for each window of ``k`` consecutive tokens of ``seq``.
+    A non-positive ``k`` falls back to 4. A non-empty sequence shorter than
+    ``k`` yields exactly one hash covering all of its tokens; an empty
+    sequence yields nothing.
+    """
+    if k <= 0:
+        k = 4
+    if not seq:
+        return
+    # Without the floor of one window, every text with fewer than k tokens
+    # would produce no shingles at all and thus share one fingerprint.
+    windows = max(1, len(seq) - k + 1)
+    for i in range(windows):
+        payload = " ".join(seq[i:i+k]).encode()
+        yield int.from_bytes(hashlib.blake2b(payload, digest_size=8).digest(), 'big')
+def simhash64(s: str, k: int = 4) -> int:
+    """Compute a 64-bit SimHash of ``s`` from the hashes of its ``k``-token shingles.
+    Each shingle hash votes +1 on every bit position it has set and -1 on
+    every other position; the result has exactly the bits whose total is
+    positive.
+    """
+    weights = [0] * 64
+    for digest in _shingles(_tokens(s), k=k):
+        for pos in range(64):
+            if (digest >> pos) & 1:
+                weights[pos] += 1
+            else:
+                weights[pos] -= 1
+    # The powers of two are distinct, so adding them equals OR-ing them.
+    return sum(1 << pos for pos, weight in enumerate(weights) if weight > 0)
+def hamming(a: int, b: int) -> int:
+    """Return the number of set bits in ``a ^ b``, i.e. the Hamming distance of two non-negative integers."""
+    differing = a ^ b
+    return bin(differing).count("1")

src/dir2md/summary.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from __future__ import annotations
+from pathlib import Path
+import ast
+def summarize(path: Path, content: str, max_lines: int = 60) -> str:
+    """Produce a short bullet-list summary of a file.
+    - ``.py``: top-level imports, classes and functions (``def`` and
+      ``async def``), each list cut to 200 characters, or ``- (no symbols)``
+      when there are none. Source that cannot be parsed falls through to the
+      generic summary.
+    - ``.md`` / ``.markdown``: up to 10 heading lines; when there are none,
+      the generic summary.
+    - anything else: the non-blank lines among the first ``max_lines`` lines.
+    """
+    ext = path.suffix.lower()
+    if ext == ".py":
+        try:
+            tree = ast.parse(content)
+        except Exception:
+            # Deliberate best effort: unparsable source gets the generic summary below.
+            tree = None
+        if tree is not None:
+            # async functions are top-level functions too and must be listed.
+            funcs = [n.name for n in tree.body if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))]
+            clss = [n.name for n in tree.body if isinstance(n, ast.ClassDef)]
+            imps = []
+            for n in tree.body:
+                if isinstance(n, (ast.Import, ast.ImportFrom)):
+                    imps.extend(a.name for a in n.names)
+            lines = []
+            if imps:
+                lines.append(f"- imports: {', '.join(imps)[:200]}")
+            if clss:
+                lines.append(f"- classes: {', '.join(clss)[:200]}")
+            if funcs:
+                lines.append(f"- functions: {', '.join(funcs)[:200]}")
+            return "\n".join(lines) or "- (no symbols)"
+    if ext in {".md", ".markdown"}:
+        heads = [ln.strip() for ln in content.splitlines() if ln.strip().startswith("#")][:10]
+        return "\n".join([f"- {h}" for h in heads]) or _first_lines(content, max_lines)
+    return _first_lines(content, max_lines)
+def _first_lines(content: str, max_lines: int) -> str:
+    """Turn the non-blank lines among the first ``max_lines`` lines of ``content`` into a bullet list."""
+    head = content.splitlines()[:max_lines]
+    # str.strip returns an empty (falsy) string for whitespace-only lines.
+    kept = filter(str.strip, head)
+    return "\n".join(f"- {ln}" for ln in kept)

src/dir2md/token.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from __future__ import annotations
+def estimate_tokens(text: str) -> int:
+    """Estimate the token count of ``text``: one token per 4 characters, rounded up, never less than 1."""
+    # Simple estimation: 4 chars ≈ 1 token
+    whole, rest = divmod(len(text), 4)
+    return max(1, whole + (1 if rest else 0))

tests/test_dir2md.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from __future__ import annotations
+import json, tempfile
+from pathlib import Path
+from dir2md.core import Config, generate_markdown_report
+def _make_repo(tmp: Path) -> Path:
+    """Populate ``tmp`` with a small fixture project and return ``tmp``.
+    Layout: ``src/a.py`` (long), ``src/b.py``, ``src/b_copy.py`` (same text as
+    ``b.py``) and ``README.md``.
+    """
+    src = tmp / "src"
+    src.mkdir(parents=True, exist_ok=True)
+    # Make this file long enough to trigger truncation
+    long_content = "\n".join(f"    print('line {i}')" for i in range(100))
+    module_a = f"""
+import os
+class A: pass
+def foo():
+{long_content}
+    return 42
+"""
+    (src / "a.py").write_text(module_a, encoding="utf-8")
+    module_b = """
+import sys
+def bar():
+    return 43
+"""
+    (src / "b.py").write_text(module_b, encoding="utf-8")
+    # Similar file (for deduplication testing): same text as b.py
+    (src / "b_copy.py").write_text(module_b, encoding="utf-8")
+    (tmp / "README.md").write_text("# Title\n\nSome text\n", encoding="utf-8")
+    return tmp
+def test_budget_and_modes(tmp_path: Path):
+    """Summary mode: the report shows a token estimate and the manifest lists a.py and b.py."""
+    root = _make_repo(tmp_path)
+    cfg = Config(
+        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
+        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
+        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
+        llm_mode="summary", budget_tokens=200, max_file_tokens=1200, dedup_bits=16,
+        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=True,
+        preset="pro", explain_capsule=True,
+    )
+    report = generate_markdown_report(cfg)
+    assert "Estimated tokens (prompt):" in report
+    manifest_file = root / "OUT.manifest.json"
+    assert manifest_file.exists()
+    manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
+    # b_copy.py likely to be excluded due to deduplication
+    listed = [entry["path"] for entry in manifest["files"]]
+    for expected in ("a.py", "b.py"):
+        assert any(p.endswith(expected) for p in listed)
+def test_ref_mode_manifest(tmp_path: Path):
+    """Ref mode: the manifest has ``stats`` and ``files`` and every file entry carries a ``sha256`` key."""
+    root = _make_repo(tmp_path)
+    cfg = Config(
+        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
+        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
+        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
+        llm_mode="ref", budget_tokens=120, max_file_tokens=1200, dedup_bits=16,
+        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=True,
+        preset="pro", explain_capsule=False,
+    )
+    # Only the manifest written as a side effect is inspected here.
+    generate_markdown_report(cfg)
+    manifest_file = root / "OUT.manifest.json"
+    manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
+    for section in ("stats", "files"):
+        assert section in manifest
+    assert all("sha256" in entry for entry in manifest["files"])
+def test_inline_sampling(tmp_path: Path):
+    """Inline mode with tight limits: the report contains the truncation marker and the explain note."""
+    root = _make_repo(tmp_path)
+    # Drastically reduced budget to trigger sampling
+    cfg = Config(
+        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
+        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=50,
+        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
+        llm_mode="inline", budget_tokens=50, max_file_tokens=30, dedup_bits=0,
+        sample_head=5, sample_tail=3, strip_comments=False, emit_manifest=False,
+        preset="pro", explain_capsule=True,
+    )
+    report = generate_markdown_report(cfg)
+    for marker in ("truncated middle", "why: inline"):
+        assert marker in report
+    assert "why: inline" in md
+def test_masking(tmp_path: Path):
+    """Check that ``masking_mode="basic"`` hides a secret and ``"off"`` leaves it visible.
+    A ``.env`` file holding an AWS-style access key is added to the fixture
+    repo; the same config object is then reused with masking switched off.
+    """
+    root = _make_repo(tmp_path)
+    # Add a file with a secret
+    secret_content = "My AWS key is AKIAIOSFODNN7EXAMPLE"
+    (root / ".env").write_text(secret_content, encoding="utf-8")
+    # Inline mode with dedup disabled and limits of 1000 tokens.
+    cfg = Config(
+        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
+        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
+        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
+        llm_mode="inline", budget_tokens=1000, max_file_tokens=1000, dedup_bits=0,
+        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=False,
+        preset="pro", explain_capsule=False, no_timestamp=True,
+        masking_mode="basic",
+    )
+    md = generate_markdown_report(cfg)
+    # With basic masking the raw secret must be gone and the placeholder present.
+    assert secret_content not in md
+    assert "[*** MASKED_SECRET ***]" in md
+    # Test with masking off
+    cfg.masking_mode = "off"
+    md_unmasked = generate_markdown_report(cfg)
+    # Now the raw secret must appear verbatim and no placeholder may be emitted.
+    assert secret_content in md_unmasked
+    assert "[*** MASKED_SECRET ***]" not in md_unmasked