Commit
·
d466b7d
0
Parent(s):
Initial commit: Dir2md open-core project
Browse files- .devcontainer/devcontainer.json +10 -0
- .github/workflows/dir2md-blueprint.yml +55 -0
- .gitignore +49 -0
- .pre-commit-config.yaml +9 -0
- Dockerfile +6 -0
- FEATURES.md +271 -0
- LICENSE +21 -0
- README.md +175 -0
- pyproject.toml +22 -0
- scripts/bench_dir2md.py +37 -0
- src/dir2md/__init__.py +2 -0
- src/dir2md/cli.py +109 -0
- src/dir2md/core.py +249 -0
- src/dir2md/gitignore.py +41 -0
- src/dir2md/license.py +62 -0
- src/dir2md/manifest.py +9 -0
- src/dir2md/markdown.py +36 -0
- src/dir2md/parallel.py +20 -0
- src/dir2md/simhash.py +29 -0
- src/dir2md/summary.py +32 -0
- src/dir2md/token.py +5 -0
- tests/test_dir2md.py +109 -0
.devcontainer/devcontainer.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "dir2md",
|
| 3 |
+
"image": "mcr.microsoft.com/devcontainers/python:3.11",
|
| 4 |
+
"postCreateCommand": "pip install -e . && pre-commit install",
|
| 5 |
+
"customizations": {
|
| 6 |
+
"vscode": {
|
| 7 |
+
"extensions": ["ms-python.python", "ms-python.vscode-pylance"]
|
| 8 |
+
}
|
| 9 |
+
}
|
| 10 |
+
}
|
.github/workflows/dir2md-blueprint.yml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Builds a dir2md blueprint of the repository on every pull request (or on
# manual dispatch), uploads the generated files as an artifact, and posts the
# estimated prompt-token count as a PR comment.
name: dir2md Blueprint
on:
  pull_request:
    types: [opened, synchronize, reopened]
  workflow_dispatch:

# Least-privilege token: read the code, write the PR comment.
permissions:
  contents: read
  pull-requests: write

jobs:
  build-blueprint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dir2md
        run: |
          python -m pip install --upgrade pip
          pip install .
      - name: Generate blueprint
        id: gen
        run: |
          # Best-effort: a dir2md failure is deliberately ignored here.
          dir2md . --capsule --emit-manifest --stats -o PROJECT_BLUEPRINT.md || true
          # The manifest may be absent if generation failed; report "n/a"
          # instead of letting jq abort the step with an unrelated error.
          if [ -f PROJECT_BLUEPRINT.manifest.json ]; then
            TOKENS=$(jq -r '.stats.est_tokens_prompt // "n/a"' PROJECT_BLUEPRINT.manifest.json)
          else
            TOKENS="n/a"
          fi
          echo "tokens=$TOKENS" >> "$GITHUB_OUTPUT"
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: dir2md-blueprint
          path: |
            PROJECT_BLUEPRINT.md
            PROJECT_BLUEPRINT.manifest.json
            PROJECT_BLUEPRINT.capsule.zip
      - name: Comment PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const tokens = '${{ steps.gen.outputs.tokens }}';
            const body = [
              '## 📦 dir2md Blueprint',
              `- Estimated prompt tokens: **${tokens}**`,
              '- Artifacts: _see workflow run → Artifacts_',
              '',
              'Run locally:',
              '```bash',
              'pip install .',
              'dir2md .',
              '```'
            ].join('\n');
            // Await the request so an API failure fails this step visibly.
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body
            });
|
.gitignore
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.env
|
| 6 |
+
.venv/
|
| 7 |
+
venv/
|
| 8 |
+
.idea/
|
| 9 |
+
.vscode/
|
| 10 |
+
.ipynb_checkpoints/
|
| 11 |
+
.dist/
|
| 12 |
+
.build/
|
| 13 |
+
.coverage
|
| 14 |
+
.pytest_cache/
|
| 15 |
+
|
| 16 |
+
# OS
|
| 17 |
+
.DS_Store
|
| 18 |
+
Thumbs.db
|
| 19 |
+
|
| 20 |
+
# Output files - exclude test outputs but keep documentation
|
| 21 |
+
*.manifest.json
|
| 22 |
+
*_blueprint.md
|
| 23 |
+
*_summary.md
|
| 24 |
+
*_output.md
|
| 25 |
+
test_output.md
|
| 26 |
+
example_*.md
|
| 27 |
+
pro_*.md
|
| 28 |
+
raw_*.md
|
| 29 |
+
secure_*.md
|
| 30 |
+
masking.py
|
| 31 |
+
|
| 32 |
+
# Keep important documentation
|
| 33 |
+
!README.md
|
| 34 |
+
!FEATURES.md
|
| 35 |
+
!CURRENT_FEATURES.md
|
| 36 |
+
!CONTRIBUTING.md
|
| 37 |
+
!CHANGELOG.md
|
| 38 |
+
|
| 39 |
+
# Virtual environment (large, not needed)
|
| 40 |
+
venv_clean/
|
| 41 |
+
|
| 42 |
+
# Additional ignores
|
| 43 |
+
.dir2md_cache/
|
| 44 |
+
tmp/
|
| 45 |
+
temp/
|
| 46 |
+
|
| 47 |
+
# Personal files to ignore
|
| 48 |
+
ENGLISH_CONVERSION_COMPLETE.md
|
| 49 |
+
USAGE_EXAMPLES.md
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
repos:
|
| 2 |
+
- repo: local
|
| 3 |
+
hooks:
|
| 4 |
+
- id: dir2md-dry-run
|
| 5 |
+
name: dir2md (dry-run)
|
| 6 |
+
entry: bash -lc 'dir2md . --preset iceberg --emit-manifest --stats --dry-run >/dev/null || true'
|
| 7 |
+
language: system
|
| 8 |
+
pass_filenames: false
|
| 9 |
+
stages: [pre-commit]
|
Dockerfile
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
WORKDIR /work
|
| 3 |
+
COPY . /work
|
| 4 |
+
RUN pip install --no-cache-dir .
|
| 5 |
+
ENTRYPOINT ["dir2md"]
|
| 6 |
+
CMD ["."]
|
FEATURES.md
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dir2md Feature Comparison: Open Source vs Pro
|
| 2 |
+
|
| 3 |
+
> **Transform your codebase into LLM-optimized markdown blueprints**
|
| 4 |
+
|
| 5 |
+
Dir2md follows an **Open-Core** model - providing essential functionality for free while offering advanced features for professional teams and power users.
|
| 6 |
+
|
| 7 |
+
## 🎯 Quick Comparison
|
| 8 |
+
|
| 9 |
+
| Feature Category | Open Source (Free) | Pro Version |
|
| 10 |
+
|------------------|-------------------|-------------|
|
| 11 |
+
| **Basic Functionality** | ✅ Full Access | ✅ Enhanced |
|
| 12 |
+
| **Security & Masking** | ✅ Basic Patterns | ✅ Advanced + Custom |
|
| 13 |
+
| **Performance** | ✅ Single-threaded | ✅ Parallel + Caching |
|
| 14 |
+
| **Export Options** | ✅ Markdown Only | ✅ HTML, PDF, Slides |
|
| 15 |
+
| **Team Features** | ❌ Individual Use | ✅ CI/CD Integration |
|
| 16 |
+
| **Language Support** | ✅ Basic Analysis | ✅ Smart Plugins |
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## 🔓 Open Source Features (MIT License)
|
| 21 |
+
|
| 22 |
+
### Core Functionality
|
| 23 |
+
- **📁 Directory Scanning**: Complete file tree analysis with `.gitignore` support
|
| 24 |
+
- **🎯 Smart Filtering**: Include/exclude/omit glob patterns
|
| 25 |
+
- **📊 Token Optimization**: Head/tail sampling with configurable budgets
|
| 26 |
+
- **🔄 Duplicate Detection**: SimHash-based content deduplication
|
| 27 |
+
- **📋 Manifest Generation**: JSON metadata with file hashes and statistics
|
| 28 |
+
- **⏰ Deterministic Output**: `--no-timestamp` for reproducible builds
|
| 29 |
+
- **🎨 Multiple Presets**: `iceberg`, `pro`, `raw` (default: `raw` for developers)
|
| 30 |
+
|
| 31 |
+
### Basic Security
|
| 32 |
+
- **🛡️ Essential Masking**: Protection for common secrets
|
| 33 |
+
- AWS Access Keys (`AKIA[0-9A-Z]{16}`)
|
| 34 |
+
- Bearer Tokens (`Bearer <token>`)
|
| 35 |
+
- Private Keys (`-----BEGIN ... PRIVATE KEY-----`)
|
| 36 |
+
|
| 37 |
+
### Output Modes
|
| 38 |
+
- **📝 Reference Mode**: File listings with metadata
|
| 39 |
+
- **📖 Summary Mode**: Condensed content overview
|
| 40 |
+
- **📄 Inline Mode**: Full content inclusion (within token budget)
|
| 41 |
+
|
| 42 |
+
### CLI & Integration
|
| 43 |
+
- **⚡ Command Line Interface**: Full-featured CLI with help system
|
| 44 |
+
- **🔧 Configurable Options**: Extensive customization via arguments
|
| 45 |
+
- **📦 Easy Installation**: `pip install dir2md`
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## 🔒 Pro Version Features
|
| 50 |
+
|
| 51 |
+
### Advanced Security & Compliance
|
| 52 |
+
- **🛡️ Comprehensive Masking**: 25+ built-in patterns
|
| 53 |
+
- Cloud Provider Keys (AWS, Azure, GCP)
|
| 54 |
+
- API Tokens (Slack, GitHub, GitLab)
|
| 55 |
+
- Database Connections & Credentials
|
| 56 |
+
- Custom Pattern Support
|
| 57 |
+
- **🔍 Smart Detection**: File-type aware masking
|
| 58 |
+
- **✅ False Positive Reduction**: Context-aware pattern matching
|
| 59 |
+
- **📝 Audit Logging**: Security scanning reports
|
| 60 |
+
|
| 61 |
+
### Performance & Scale
|
| 62 |
+
- **⚡ Parallel Processing**: Multi-threaded file analysis
|
| 63 |
+
- **💾 Incremental Caching**: `.dir2md_cache/` for faster re-runs
|
| 64 |
+
- **📈 Large Repository Support**: Optimized for 10,000+ files
|
| 65 |
+
- **🚀 Streaming Processing**: Memory-efficient for massive codebases
|
| 66 |
+
|
| 67 |
+
### Advanced Analysis
|
| 68 |
+
- **🧠 Language Plugins**: Smart code analysis
|
| 69 |
+
- **Python**: AST parsing, function/class extraction
|
| 70 |
+
- **JavaScript/TypeScript**: ES module analysis, export detection
|
| 71 |
+
- **Go**: Package structure, type definitions
|
| 72 |
+
- **Java**: Class hierarchy, annotation extraction
|
| 73 |
+
- **📊 Drift Detection**: Compare blueprint versions
|
| 74 |
+
- **🎯 Impact Scoring**: Identify critical changes
|
| 75 |
+
|
| 76 |
+
### Export & Sharing
|
| 77 |
+
- **📄 Multiple Formats**: HTML, PDF, PowerPoint slides
|
| 78 |
+
- **🎨 Custom Templates**: Branded output with Jinja2
|
| 79 |
+
- **📱 Responsive HTML**: Mobile-friendly documentation
|
| 80 |
+
- **🖨️ Print Optimization**: Publication-ready PDFs
|
| 81 |
+
|
| 82 |
+
### Team & CI/CD Integration
|
| 83 |
+
- **🤖 GitHub Actions**: Automated blueprint generation
|
| 84 |
+
- **💬 PR Comments**: Automatic documentation updates
|
| 85 |
+
- **🔗 GitLab Integration**: Pipeline integration support
|
| 86 |
+
- **📋 Status Checks**: Quality gates for documentation
|
| 87 |
+
- **👥 Team Templates**: Standardized output formats
|
| 88 |
+
|
| 89 |
+
### Developer Experience
|
| 90 |
+
- **🖥️ Terminal UI (TUI)**: Interactive file selection
|
| 91 |
+
- **🔍 Live Preview**: Real-time output preview
|
| 92 |
+
- **⚙️ Advanced Configuration**: Team-wide settings
|
| 93 |
+
- **📊 Analytics Dashboard**: Usage metrics and insights
|
| 94 |
+
|
| 95 |
+
---
|
| 96 |
+
|
| 97 |
+
## 💰 Pricing & Licensing
|
| 98 |
+
|
| 99 |
+
### Open Source (MIT)
|
| 100 |
+
- **Price**: Free forever
|
| 101 |
+
- **Use Case**: Individual developers, small projects
|
| 102 |
+
- **Support**: Community via GitHub Issues
|
| 103 |
+
- **License**: MIT - commercial use allowed
|
| 104 |
+
|
| 105 |
+
### Pro Version
|
| 106 |
+
- **Individual**: $29/month or $290/year
|
| 107 |
+
- **Team (5 users)**: $99/month or $990/year
|
| 108 |
+
- **Enterprise**: Custom pricing with on-premise options
|
| 109 |
+
- **Support**: Priority email support + documentation
|
| 110 |
+
- **License**: Commercial license with usage analytics opt-out
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## 🚀 Usage Examples
|
| 115 |
+
|
| 116 |
+
### Open Source Quick Start
|
| 117 |
+
|
| 118 |
+
```bash
|
| 119 |
+
# Install from PyPI
|
| 120 |
+
pip install dir2md
|
| 121 |
+
|
| 122 |
+
# Basic usage with security masking
|
| 123 |
+
dir2md ./my-project --masking basic --preset raw
|
| 124 |
+
|
| 125 |
+
# Generate with manifest for CI/CD
|
| 126 |
+
dir2md . --emit-manifest --no-timestamp --output blueprint.md
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
### Pro Version Examples
|
| 130 |
+
|
| 131 |
+
```bash
|
| 132 |
+
# Set Pro license
|
| 133 |
+
export DIR2MD_LICENSE="PRO-your-license-key"
|
| 134 |
+
|
| 135 |
+
# Advanced masking with custom patterns
|
| 136 |
+
dir2md . --masking advanced --preset pro
|
| 137 |
+
|
| 138 |
+
# Parallel processing with caching
|
| 139 |
+
dir2md ./large-repo --parallel --use-cache
|
| 140 |
+
|
| 141 |
+
# Generate multiple formats
|
| 142 |
+
dir2md . --export html,pdf --template branded
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
### GitHub Actions Integration
|
| 146 |
+
|
| 147 |
+
**Open Source:**
|
| 148 |
+
```yaml
|
| 149 |
+
- name: Generate Blueprint
|
| 150 |
+
run: |
|
| 151 |
+
pip install dir2md
|
| 152 |
+
dir2md . --no-timestamp --output docs/blueprint.md
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
**Pro Version:**
|
| 156 |
+
```yaml
|
| 157 |
+
- name: Generate Pro Blueprint
|
| 158 |
+
env:
|
| 159 |
+
DIR2MD_LICENSE: ${{ secrets.DIR2MD_PRO_LICENSE }}
|
| 160 |
+
run: |
|
| 161 |
+
pip install dir2md-pro
|
| 162 |
+
dir2md . --masking advanced --export html --pr-comment
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
---
|
| 166 |
+
|
| 167 |
+
## 🎯 When to Upgrade to Pro
|
| 168 |
+
|
| 169 |
+
### Individual Developers
|
| 170 |
+
- Working with sensitive codebases requiring advanced security
|
| 171 |
+
- Need faster processing for large repositories (1000+ files)
|
| 172 |
+
- Want professional-looking exports for client presentations
|
| 173 |
+
- Require language-specific code analysis
|
| 174 |
+
|
| 175 |
+
### Teams & Organizations
|
| 176 |
+
- Standardizing documentation across multiple projects
|
| 177 |
+
- Integrating with CI/CD pipelines for automatic updates
|
| 178 |
+
- Need compliance features for security auditing
|
| 179 |
+
- Want team analytics and usage insights
|
| 180 |
+
|
| 181 |
+
### Enterprise Users
|
| 182 |
+
- On-premise deployment requirements
|
| 183 |
+
- SSO/SAML integration needs
|
| 184 |
+
- Custom security patterns and compliance rules
|
| 185 |
+
- Dedicated support and SLA requirements
|
| 186 |
+
|
| 187 |
+
---
|
| 188 |
+
|
| 189 |
+
## 🛠️ Technical Implementation
|
| 190 |
+
|
| 191 |
+
### Open-Core Architecture
|
| 192 |
+
```
|
| 193 |
+
dir2md-core (OSS) dir2md-pro (Commercial)
|
| 194 |
+
├── CLI Interface ├── Advanced Masking
|
| 195 |
+
├── File Scanning ├── Language Plugins
|
| 196 |
+
├── Token Optimization ├── Parallel Engine
|
| 197 |
+
├── Basic Masking ├── Export Templates
|
| 198 |
+
├── Manifest Generation ├── Team Integration
|
| 199 |
+
└── Markdown Output └── License Validation
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
### License Validation
|
| 203 |
+
- **Runtime Check**: Environment variable `DIR2MD_LICENSE`
|
| 204 |
+
- **Offline Validation**: Ed25519 signature verification
|
| 205 |
+
- **Graceful Degradation**: Falls back to OSS features if invalid
|
| 206 |
+
- **No Phone Home**: All validation happens locally
|
| 207 |
+
|
| 208 |
+
### Plugin System
|
| 209 |
+
```python
|
| 210 |
+
# Pro Plugin Example
|
| 211 |
+
class PythonAnalyzer(LanguagePlugin):
|
| 212 |
+
extensions = {'.py'}
|
| 213 |
+
|
| 214 |
+
def analyze(self, content: str) -> Dict[str, Any]:
|
| 215 |
+
return {
|
| 216 |
+
'functions': self.extract_functions(content),
|
| 217 |
+
'classes': self.extract_classes(content),
|
| 218 |
+
'imports': self.extract_imports(content)
|
| 219 |
+
}
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
---
|
| 223 |
+
|
| 224 |
+
## 🆚 Comparison with Alternatives
|
| 225 |
+
|
| 226 |
+
| Tool | Open Source | Pro Features | License Model |
|
| 227 |
+
|------|-------------|--------------|---------------|
|
| 228 |
+
| **dir2md** | ✅ Full core functionality | ✅ Advanced security, performance, team features | Open-Core (MIT + Commercial) |
|
| 229 |
+
| tree + cat | ✅ Basic listing | ❌ No advanced features | Free (but manual) |
|
| 230 |
+
| Proprietary doc tools | ❌ Closed source | ✅ Enterprise features | Subscription only |
|
| 231 |
+
| Custom scripts | ✅ DIY solution | ❌ No standardization | Time investment |
|
| 232 |
+
|
| 233 |
+
---
|
| 234 |
+
|
| 235 |
+
## 📞 Get Started
|
| 236 |
+
|
| 237 |
+
### Try Open Source
|
| 238 |
+
```bash
|
| 239 |
+
pip install dir2md
|
| 240 |
+
dir2md --help
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
### Evaluate Pro Features
|
| 244 |
+
```bash
|
| 245 |
+
# 14-day free trial
|
| 246 |
+
export DIR2MD_LICENSE="TRIAL-request-at-dir2md.com"
|
| 247 |
+
pip install dir2md-pro
|
| 248 |
+
dir2md --masking advanced --parallel
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
### Purchase Pro License
|
| 252 |
+
- **Individual**: [Buy now for $29/month](https://dir2md.com/buy/individual)
|
| 253 |
+
- **Team**: [Start team trial](https://dir2md.com/buy/team)
|
| 254 |
+
- **Enterprise**: [Contact sales](https://dir2md.com/contact)
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
## 🤝 Contributing
|
| 259 |
+
|
| 260 |
+
Dir2md's open-source core welcomes contributions:
|
| 261 |
+
|
| 262 |
+
- **Bug Reports**: [GitHub Issues](https://github.com/your-org/dir2md/issues)
|
| 263 |
+
- **Feature Requests**: [GitHub Discussions](https://github.com/your-org/dir2md/discussions)
|
| 264 |
+
- **Code Contributions**: See [CONTRIBUTING.md](CONTRIBUTING.md)
|
| 265 |
+
- **Documentation**: Help improve our guides and examples
|
| 266 |
+
|
| 267 |
+
Pro features are developed in-house but benefit from community feedback and OSS improvements.
|
| 268 |
+
|
| 269 |
+
---
|
| 270 |
+
|
| 271 |
+
*Made with ❤️ for developers who value great documentation*
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024 Yoon
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dir2md
|
| 2 |
+
|
| 3 |
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
| 4 |
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
|
| 5 |
+
|
| 6 |
+
> Transform your codebase into LLM-optimized markdown blueprints
|
| 7 |
+
|
| 8 |
+
Dir2md analyzes directory structures and generates comprehensive markdown documentation optimized for Large Language Models. It intelligently samples content, removes duplicates, and provides token-budget control to create the perfect context for AI-assisted development.
|
| 9 |
+
|
| 10 |
+
## ✨ Key Features
|
| 11 |
+
|
| 12 |
+
- **🎯 Smart Content Sampling**: Head/tail sampling with configurable token budgets
|
| 13 |
+
- **🔄 Duplicate Detection**: SimHash-based deduplication to reduce noise
|
| 14 |
+
- **🛡️ Security First**: Built-in secret masking (basic OSS, advanced Pro)
|
| 15 |
+
- **📊 Multiple Output Modes**: Reference, summary, or full inline content
|
| 16 |
+
- **🔧 Highly Configurable**: Extensive filtering and customization options
|
| 17 |
+
- **⚡ Developer Friendly**: Raw mode default for complete code visibility
|
| 18 |
+
|
| 19 |
+
## 🚀 Quick Start
|
| 20 |
+
|
| 21 |
+
### Installation
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
# From source (current)
|
| 25 |
+
git clone https://github.com/your-org/dir2md.git
|
| 26 |
+
cd dir2md
|
| 27 |
+
python -m src.dir2md.cli --help
|
| 28 |
+
|
| 29 |
+
# Coming soon: PyPI installation
|
| 30 |
+
pip install dir2md
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
### Basic Usage
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
# Generate project blueprint (developer-friendly raw mode)
|
| 37 |
+
dir2md .
|
| 38 |
+
|
| 39 |
+
# With basic security masking
|
| 40 |
+
dir2md . --masking basic
|
| 41 |
+
|
| 42 |
+
# Generate with manifest for CI/CD
|
| 43 |
+
dir2md . --emit-manifest --no-timestamp
|
| 44 |
+
|
| 45 |
+
# Token-optimized for LLM context
|
| 46 |
+
dir2md . --budget-tokens 4000 --preset iceberg
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
### Output Example
|
| 50 |
+
|
| 51 |
+
```markdown
|
| 52 |
+
# Project Blueprint
|
| 53 |
+
|
| 54 |
+
- Root: `/path/to/project`
|
| 55 |
+
- Generated: `2025-09-08 12:30:15`
|
| 56 |
+
- Preset: `raw`
|
| 57 |
+
- LLM mode: `inline`
|
| 58 |
+
- Estimated tokens (prompt): `6247`
|
| 59 |
+
|
| 60 |
+
## Directory Tree
|
| 61 |
+
[Complete file structure]
|
| 62 |
+
|
| 63 |
+
## Statistics
|
| 64 |
+
| Metric | Value |
|
| 65 |
+
|--------|-------|
|
| 66 |
+
| Total files | 42 |
|
| 67 |
+
| Estimated tokens | 6247 |
|
| 68 |
+
|
| 69 |
+
## File Contents
|
| 70 |
+
[Intelligently sampled content...]
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
## 📋 Available Presets
|
| 74 |
+
|
| 75 |
+
| Preset | Description | Best For |
|
| 76 |
+
|--------|-------------|-----------|
|
| 77 |
+
| `raw` | Full content inclusion | Development, code review |
|
| 78 |
+
| `iceberg` | Balanced sampling | General documentation |
|
| 79 |
+
| `pro` | Advanced optimization | Large projects, LLM context |
|
| 80 |
+
|
| 81 |
+
## 🔒 Open-Core Model
|
| 82 |
+
|
| 83 |
+
### Free (OSS) Features
|
| 84 |
+
- Complete directory analysis
|
| 85 |
+
- Token optimization and sampling
|
| 86 |
+
- SimHash deduplication
|
| 87 |
+
- Basic security masking (3 patterns)
|
| 88 |
+
- All output modes and presets
|
| 89 |
+
- Deterministic builds
|
| 90 |
+
|
| 91 |
+
### Pro Features
|
| 92 |
+
- Advanced security masking (9+ patterns)
|
| 93 |
+
- Parallel processing & caching
|
| 94 |
+
- Language-specific analysis plugins
|
| 95 |
+
- HTML/PDF export options
|
| 96 |
+
- Team integration (CI/CD, PR bots)
|
| 97 |
+
- Priority support
|
| 98 |
+
|
| 99 |
+
[Learn more about Pro features](FEATURES.md)
|
| 100 |
+
|
| 101 |
+
## 📖 Documentation
|
| 102 |
+
|
| 103 |
+
- **[Feature Comparison](FEATURES.md)** - Complete OSS vs Pro breakdown
|
| 104 |
+
- **[Current Status](CURRENT_FEATURES.md)** - What's implemented now
|
| 105 |
+
- **[Usage Examples](USAGE_EXAMPLES.md)** - Hands-on guide with examples
|
| 106 |
+
|
| 107 |
+
## 🛠️ CLI Reference
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
# Basic options
|
| 111 |
+
dir2md [path] -o output.md --preset [iceberg|pro|raw]
|
| 112 |
+
|
| 113 |
+
# Token control
|
| 114 |
+
--budget-tokens 6000 # Total token budget
|
| 115 |
+
--max-file-tokens 1200 # Per-file token limit
|
| 116 |
+
--sample-head 120 # Lines from file start
|
| 117 |
+
--sample-tail 40 # Lines from file end
|
| 118 |
+
|
| 119 |
+
# Filtering
|
| 120 |
+
--include-glob "*.py,*.md" # Include patterns
|
| 121 |
+
--exclude-glob "test*,*.tmp" # Exclude patterns
|
| 122 |
+
--only-ext "py,js,ts" # File extensions only
|
| 123 |
+
|
| 124 |
+
# Security
|
| 125 |
+
--masking [off|basic|advanced] # Secret masking level
|
| 126 |
+
|
| 127 |
+
# Output
|
| 128 |
+
--emit-manifest # Generate JSON metadata
|
| 129 |
+
--no-timestamp # Reproducible output
|
| 130 |
+
--dry-run # Preview without writing
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
## 🤝 Contributing
|
| 134 |
+
|
| 135 |
+
We welcome contributions! Dir2md follows an open-core model:
|
| 136 |
+
|
| 137 |
+
- **Core functionality**: Open source (this repo)
|
| 138 |
+
- **Advanced features**: Commercial (separate repo)
|
| 139 |
+
- **Community**: All discussions welcome
|
| 140 |
+
|
| 141 |
+
### Development Setup
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
git clone https://github.com/your-org/dir2md.git
|
| 145 |
+
cd dir2md
|
| 146 |
+
python -m pytest -v # Run tests
|
| 147 |
+
python -m src.dir2md.cli . --dry-run # Test CLI
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
### Reporting Issues
|
| 151 |
+
|
| 152 |
+
- 🐛 **Bug reports**: [GitHub Issues](https://github.com/your-org/dir2md/issues)
|
| 153 |
+
- 💡 **Feature requests**: [GitHub Discussions](https://github.com/your-org/dir2md/discussions)
|
| 154 |
+
- 📧 **Security issues**: [email protected]
|
| 155 |
+
|
| 156 |
+
## 📄 License
|
| 157 |
+
|
| 158 |
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
| 159 |
+
|
| 160 |
+
Pro features are available under a separate commercial license.
|
| 161 |
+
|
| 162 |
+
## 🌟 Why Dir2md?
|
| 163 |
+
|
| 164 |
+
Traditional documentation approaches fall short when working with AI assistants:
|
| 165 |
+
|
| 166 |
+
- **Too much noise**: Raw `tree` + `cat` includes irrelevant files
|
| 167 |
+
- **Token waste**: Unoptimized content hits LLM context limits
|
| 168 |
+
- **Security risks**: Accidental exposure of secrets and keys
|
| 169 |
+
- **No structure**: Difficult for AI to understand project layout
|
| 170 |
+
|
| 171 |
+
Dir2md solves these problems with intelligent analysis, sampling, and optimization specifically designed for the AI era.
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
*Made with ❤️ for developers who want their AI to understand their code*
|
pyproject.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "dir2md"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "Generate a Markdown blueprint: directory tree + optional file contents (token-optimized, ICEBERG preset)"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
authors = [{name = "Flamehaven", email = "[email protected]"}]
|
| 11 |
+
license = {text = "MIT"}
|
| 12 |
+
requires-python = ">=3.9"
|
| 13 |
+
dependencies = ["pathspec>=0.12.0"]
|
| 14 |
+
|
| 15 |
+
[project.scripts]
|
| 16 |
+
dir2md = "dir2md.cli:main"
|
| 17 |
+
|
| 18 |
+
[tool.setuptools]
|
| 19 |
+
package-dir = {"" = "src"}
|
| 20 |
+
|
| 21 |
+
[tool.setuptools.packages.find]
|
| 22 |
+
where = ["src"]
|
scripts/bench_dir2md.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import time, json, argparse
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from dir2md.core import Config, generate_markdown_report
|
| 5 |
+
|
| 6 |
+
def run_case(root: Path, preset: str, mode: str | None, budget: int, file_budget: int) -> dict:
    """Benchmark one dir2md configuration and report its timing and token estimate.

    Args:
        root: Directory to scan; also used as the parent of the ``_BENCH.md``
            output path handed to ``Config``.
        preset: Preset name forwarded to ``Config``.
        mode: LLM mode forwarded to ``Config``; ``None`` falls back to ``"ref"``.
        budget: Total token budget (``budget_tokens``).
        file_budget: Per-file token limit (``max_file_tokens``).

    Returns:
        A dict with ``preset``, ``mode`` (the value stored on the config),
        ``elapsed_sec`` (wall time of report generation, rounded to 3 places)
        and ``est_tokens`` (the integer parsed from the report).

    Raises:
        ValueError: If the report has no parseable
            ``Estimated tokens (prompt)`` entry.
    """
    cfg = Config(
        root=root, output=root / "_BENCH.md", include_globs=[], exclude_globs=[], omit_globs=[],
        respect_gitignore=True, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
        llm_mode=(mode or "ref"), budget_tokens=budget, max_file_tokens=file_budget,
        dedup_bits=16, sample_head=120, sample_tail=40, strip_comments=False,
        emit_manifest=False, preset=preset, explain_capsule=True,
    )

    started = time.perf_counter()
    md = generate_markdown_report(cfg)
    elapsed = time.perf_counter() - started

    # Use the FIRST occurrence of the marker: embedded file contents (for
    # example a README showing sample output) can repeat the same line later
    # in the report, and that later value is not this run's estimate.
    marker = "Estimated tokens (prompt): `"
    _, found, tail = md.partition(marker)
    if not found:
        raise ValueError("report does not contain an 'Estimated tokens (prompt)' entry")
    est = tail.split("`", 1)[0]

    return {
        "preset": preset,
        "mode": cfg.llm_mode,
        "elapsed_sec": round(elapsed, 3),
        "est_tokens": int(est),
    }
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def main() -> None:
    """Run the fixed benchmark matrix against a directory and print the results.

    The directory is taken from the optional positional ``path`` argument
    (default ``"."``) and resolved to an absolute path. Each case is passed to
    ``run_case`` and the collected rows are printed as indented JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", nargs="?", default=".")
    args = parser.parse_args()
    root = Path(args.path).resolve()

    # Each tuple: (preset, llm mode, total token budget, per-file token budget)
    cases = [
        ("iceberg", None, 6000, 1000),
        ("pro", "summary", 6000, 1000),
        ("pro", "ref", 4000, 1000),
        ("pro", "inline", 8000, 1200),
    ]
    rows = [run_case(root, *case) for case in cases]
    print(json.dumps(rows, indent=2))


if __name__ == "__main__":
    main()
|
src/dir2md/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__all__ = ["__version__"]
|
| 2 |
+
__version__ = "0.0.1"
|
src/dir2md/cli.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import argparse, zipfile, hashlib
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from .core import Config, generate_markdown_report
|
| 5 |
+
from . import __version__
|
| 6 |
+
|
| 7 |
+
# Exclude patterns used by main() when the user passes no --exclude-glob.
# Any user-supplied --exclude-glob replaces this list rather than extending it.
DEFAULT_EXCLUDES = [
    ".git", "__pycache__", "node_modules", ".venv",
    "build", "dist", "*.pyc", ".DS_Store",
]
|
| 11 |
+
|
| 12 |
+
def positive_int(v: str) -> int:
    """argparse ``type=`` converter that accepts only integers greater than zero.

    Args:
        v: Raw command-line token.

    Returns:
        The parsed integer.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is not an integer or is <= 0.
    """
    try:
        parsed = int(v)
    except ValueError:
        raise argparse.ArgumentTypeError("Please enter an integer value.")
    if parsed > 0:
        return parsed
    raise argparse.ArgumentTypeError("Only positive integers are allowed.")
|
| 20 |
+
|
| 21 |
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for dir2md.

    Parses arguments, builds a ``Config``, renders the Markdown report and
    either prints a dry-run summary or writes the report (optionally zipped
    together with its manifest).

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit status (0 on success).
    """
    ap = argparse.ArgumentParser(prog="dir2md", description="Directory → Markdown exporter with LLM optimization")
    ap.add_argument("path", nargs="?", default=".")
    ap.add_argument("-o", "--output", default="PROJECT_BLUEPRINT.md")

    # Preset options
    ap.add_argument("--preset", default="raw", choices=["iceberg","pro","raw"], help="Preset mode: iceberg/pro/raw")

    # Token and selection control
    ap.add_argument("--llm-mode", choices=["off","ref","summary","inline"], default=None)
    ap.add_argument("--budget-tokens", type=int, default=6000)
    ap.add_argument("--max-file-tokens", type=int, default=1200)
    ap.add_argument("--dedup", type=int, default=16)
    ap.add_argument("--sample-head", type=int, default=120)
    ap.add_argument("--sample-tail", type=int, default=40)
    ap.add_argument("--explain", action="store_true", help="Include selection rationale and drift_score in capsule comments")

    # Filtering and safety controls
    ap.add_argument("--include-glob", action="append", default=[])
    ap.add_argument("--exclude-glob", action="append", default=[])
    ap.add_argument("--omit-glob", action="append", default=[])
    ap.add_argument("--only-ext", default="")
    ap.add_argument("--respect-gitignore", action="store_true")
    ap.add_argument("--follow-symlinks", action="store_true")
    ap.add_argument("--max-bytes", type=positive_int, default=200_000)
    ap.add_argument("--max-lines", type=positive_int, default=2000)

    # Output options
    ap.add_argument("--emit-manifest", action="store_true")
    ap.add_argument("--stats", action="store_true")
    ap.add_argument("--capsule", action="store_true", help="Package md+manifest into zip")
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--no-timestamp", action="store_true", help="Omit timestamp for reproducible output")
    ap.add_argument("--masking", choices=["off", "basic", "advanced"], default="off", help="Secret masking mode (advanced requires Pro license)")

    ap.add_argument("-V", "--version", action="version", version=f"dir2md {__version__}")

    ns = ap.parse_args(argv)

    root = Path(ns.path).resolve()
    output = Path(ns.output)
    # "py, .md" -> {"py", "md"}; an empty option means "no extension filter".
    only_ext = {e.strip().lstrip('.') for e in ns.only_ext.split(',') if e.strip()} or None

    cfg = Config(
        root=root,
        output=output,
        include_globs=list(ns.include_glob),
        # User-supplied excludes replace (not extend) the defaults.
        exclude_globs=list(ns.exclude_glob or DEFAULT_EXCLUDES),
        omit_globs=list(ns.omit_glob),
        respect_gitignore=bool(ns.respect_gitignore),
        follow_symlinks=bool(ns.follow_symlinks),
        max_bytes=int(ns.max_bytes) if ns.max_bytes else None,
        max_lines=int(ns.max_lines) if ns.max_lines else None,
        include_contents=True,
        only_ext=only_ext,
        # The summary table is always emitted; --stats is accepted but does
        # not change the outcome (the original expression was `stats or True`).
        add_stats=True,
        add_toc=False,
        llm_mode=(ns.llm_mode or "ref"),
        budget_tokens=int(ns.budget_tokens),
        max_file_tokens=int(ns.max_file_tokens),
        dedup_bits=int(ns.dedup),
        sample_head=int(ns.sample_head),
        sample_tail=int(ns.sample_tail),
        strip_comments=False,
        emit_manifest=bool(ns.emit_manifest),
        preset=str(ns.preset),
        explain_capsule=bool(ns.explain),
        no_timestamp=bool(ns.no_timestamp),
        masking_mode=str(ns.masking),
    )

    # generate_markdown_report applies the preset to cfg in place, so the
    # cfg fields read below reflect the effective settings.
    md = generate_markdown_report(cfg)

    if ns.dry_run:
        h = hashlib.sha256(md.encode('utf-8')).hexdigest()[:10]
        # The value shown is the configured budget, not a measured estimate,
        # so it is labelled as such.
        print(f"[dry-run] preset={cfg.preset} mode={cfg.llm_mode} budget_tokens={cfg.budget_tokens} md={h}")
        return 0

    # Allow -o to point into a directory that does not exist yet.
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(md, encoding="utf-8")
    if ns.capsule:
        manifest = output.with_suffix('.manifest.json')
        with zipfile.ZipFile(output.with_suffix('.capsule.zip'), 'w') as z:
            # Store bare file names so the archive does not embed the
            # directory layout of the machine that produced it.
            z.write(output, arcname=output.name)
            if cfg.emit_manifest and manifest.exists():
                z.write(manifest, arcname=manifest.name)
    print(f"[dir2md] Wrote: {output}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
src/dir2md/core.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
from .gitignore import build_gitignore_matcher
|
| 8 |
+
from .markdown import to_markdown
|
| 9 |
+
from .simhash import simhash64, hamming
|
| 10 |
+
from .summary import summarize
|
| 11 |
+
from .manifest import sha256_bytes, write_manifest
|
| 12 |
+
from .token import estimate_tokens
|
| 13 |
+
from .masking import apply_masking
|
| 14 |
+
|
| 15 |
+
@dataclass
class Stats:
    """Counters gathered while building a report; rendered in the summary table and manifest."""
    # Directories entered by the tree walk (the root included).
    total_dirs: int = 0
    # Files listed in the directory tree.
    total_files_in_tree: int = 0
    # Files in the tree that did not end up with a content block.
    total_omitted: int = 0
    # Files for which a content block was emitted.
    total_with_contents: int = 0
    # Sum of the per-block token estimates charged against the budget.
    est_tokens_prompt: int = 0
|
| 22 |
+
|
| 23 |
+
@dataclass
class Config:
    """Settings for one report run; apply_preset() may overwrite several fields in place."""
    # Directory to scan; must exist and be a directory.
    root: Path
    # Report path; the manifest path is derived from it with a '.manifest.json' suffix.
    output: Path
    # Include patterns (filled from --include-glob by the CLI).
    include_globs: List[str]
    # Entries matching these are dropped from the tree entirely.
    exclude_globs: List[str]
    # Files matching these stay in the tree but get no content block.
    omit_globs: List[str]
    # When true, .gitignore rules are used to drop entries from the tree.
    respect_gitignore: bool
    # Symlink-following flag (filled from --follow-symlinks by the CLI).
    follow_symlinks: bool
    # File bytes beyond this count are cut off before decoding; None/0 disables the cap.
    max_bytes: Optional[int]
    # Inline mode keeps at most this many lines per file; None/0 disables the cap.
    max_lines: Optional[int]
    # The CLI always passes True.
    include_contents: bool
    # Allowed extensions, compared lower-case without the leading dot; None means no filter.
    only_ext: Optional[set[str]] = None
    # Whether the "Summary" metrics table is appended to the report.
    add_stats: bool = True
    # The CLI always passes False.
    add_toc: bool = False
    # Preset/token related
    llm_mode: str = "ref"  # off|ref|summary|inline
    # Total token budget shared by all content blocks.
    budget_tokens: int = 6000
    # Per-file ceiling in inline mode; larger files are head/tail sampled.
    max_file_tokens: int = 1200
    # Files whose simhash is within this many bits of an earlier one are skipped; 0 disables.
    dedup_bits: int = 16
    # Number of leading lines kept when an inline file is sampled.
    sample_head: int = 120
    # Number of trailing lines kept when an inline file is sampled.
    sample_tail: int = 40
    # The CLI always passes False.
    strip_comments: bool = False
    # Whether a JSON manifest is written next to the output.
    emit_manifest: bool = True
    # iceberg|pro|raw (the choices accepted by the CLI).
    preset: str = "iceberg"
    # Adds "why:" HTML comments with the drift score to summary/inline blocks.
    explain_capsule: bool = False
    # Omits the "Generated" timestamp line from the report header.
    no_timestamp: bool = False
    # Masking mode passed to apply_masking(); "off" skips masking.
    masking_mode: str = "basic"
|
| 51 |
+
|
| 52 |
+
# Extensions the "iceberg" preset keeps when the caller supplied no filter.
# The empty string admits files without any extension.
_DEFAULT_ONLY_EXT = {"py","ts","tsx","js","jsx","md","txt","toml","yaml","yml","json", ""}


def _tree_size_bytes(root: Path) -> int:
    """Best-effort total size of all regular files below ``root``."""
    total = 0
    try:
        for entry in root.rglob('*'):
            try:
                if entry.is_file():
                    total += entry.stat().st_size
            except OSError:
                # One unreadable entry must not discard the whole measurement:
                # a zero total would wrongly classify a large repo as tiny.
                continue
    except OSError:
        pass
    return total


def apply_preset(cfg: Config) -> Config:
    """Adjust ``cfg`` in place according to ``cfg.preset`` and return it.

    * ``iceberg``: enables gitignore handling, a default extension filter,
      deduplication and the manifest, then picks the LLM mode from the total
      size of the tree (inline < 200 kB, summary < 5 MB, otherwise ref).
    * ``raw``: inline mode with no deduplication, no extension filter and no
      manifest.
    * any other preset (``pro``): the caller's settings are left untouched.

    Args:
        cfg: Configuration to modify. The same object is returned, so callers
            holding a reference observe the effective settings.

    Returns:
        The (mutated) ``cfg``.
    """
    if cfg.preset == "iceberg":
        cfg.respect_gitignore = True
        if not cfg.only_ext:
            cfg.only_ext = set(_DEFAULT_ONLY_EXT)
        cfg.dedup_bits = 16
        cfg.emit_manifest = True
        # Auto-determine mode based on repository size. The tree is measured
        # only here because no other preset uses the result.
        total_bytes = _tree_size_bytes(cfg.root)
        if total_bytes < 200_000:
            cfg.llm_mode = "inline"
            cfg.budget_tokens = min(cfg.budget_tokens, 6000)
            cfg.max_file_tokens = 1000
        elif total_bytes < 5_000_000:
            cfg.llm_mode = "summary"
            cfg.budget_tokens = min(cfg.budget_tokens, 6000)
        else:
            cfg.llm_mode = "ref"
            cfg.budget_tokens = min(cfg.budget_tokens, 4000)
    elif cfg.preset == "raw":
        cfg.llm_mode = "inline"
        cfg.dedup_bits = 0
        cfg.only_ext = None
        cfg.emit_manifest = False
    # pro: maintain user settings
    return cfg
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def generate_markdown_report(cfg: Config) -> str:
    """Scan ``cfg.root`` and render the Markdown blueprint.

    Steps: apply the preset, walk the tree (honouring gitignore/exclude
    rules), build de-duplicated content candidates, select blocks within the
    token budget according to ``cfg.llm_mode``, optionally write the JSON
    manifest next to ``cfg.output``, and hand everything to ``to_markdown``.

    Args:
        cfg: Run configuration. It is mutated in place by ``apply_preset``.

    Returns:
        The rendered Markdown document.

    Raises:
        FileNotFoundError: ``cfg.root`` does not exist.
        NotADirectoryError: ``cfg.root`` is not a directory.
    """
    cfg = apply_preset(cfg)
    root = cfg.root
    if not root.exists():
        raise FileNotFoundError(f"Path does not exist: {root}")
    if not root.is_dir():
        raise NotADirectoryError(f"Path is not a directory: {root}")

    gitignore = build_gitignore_matcher(root) if cfg.respect_gitignore else None

    def matches_any(p: Path, patterns: List[str]) -> bool:
        # Name matching uses only the components below root; otherwise a
        # project stored under e.g. ".../build/" would exclude itself.
        rel_parts = p.relative_to(root).parts
        return any(p.match(pat) or pat in rel_parts for pat in patterns)

    def is_ignored(p: Path) -> bool:
        if gitignore:
            # gitignore patterns use "/" separators, and directory-only
            # patterns ("build/") match only paths that end with a slash.
            rel = p.relative_to(root).as_posix()
            if p.is_dir():
                rel += "/"
            if gitignore(rel):
                return True
        return matches_any(p, cfg.exclude_globs)

    def is_omitted(p: Path) -> bool:
        return matches_any(p, cfg.omit_globs)

    # Tree & file collection
    tree_lines: list[str] = [str(root)]
    files: list[Path] = []
    stats = Stats()  # Pre-create for accurate directory counting
    visited_dirs: set[Path] = set()  # resolved dirs, only used when following symlinks

    def walk(current: Path, prefix: str = "") -> None:
        if cfg.follow_symlinks:
            # Guard against symlink cycles: never enter the same real
            # directory twice.
            try:
                real = current.resolve()
            except (OSError, RuntimeError):
                return
            if real in visited_dirs:
                return
            visited_dirs.add(real)
        # Count when entering directory
        stats.total_dirs += 1
        try:
            # Directories first, then case-insensitive by name.
            entries = sorted(current.iterdir(), key=lambda x: (not x.is_dir(), x.name.lower()))
        except PermissionError:
            return
        entries = [e for e in entries if not is_ignored(e)]
        for i, child in enumerate(entries):
            last = (i == len(entries) - 1)
            joint = "└── " if last else "├── "
            tree_lines.append(f"{prefix}{joint}{child.name}")
            if child.is_dir():
                if child.is_symlink() and not cfg.follow_symlinks:
                    # Listed in the tree, but not descended into.
                    continue
                # Continuation prefixes are as wide as the joint (4 columns)
                # so nested levels line up.
                walk(child, prefix + ("    " if last else "│   "))
            else:
                files.append(child)

    walk(root)

    # Generate candidates + deduplication
    candidates: list[dict] = []
    sim_seen: list[int] = []
    for f in files:
        if cfg.only_ext and f.suffix.lstrip(".").lower() not in cfg.only_ext:
            continue
        if is_omitted(f):
            continue
        try:
            raw = f.read_bytes()
        except OSError:
            continue
        # Hash the complete file before any truncation so the digest
        # identifies the file on disk.
        digest = sha256_bytes(raw)
        if cfg.max_bytes and len(raw) > cfg.max_bytes:
            raw = raw[: cfg.max_bytes]
        text = raw.decode("utf-8", errors="replace")
        if cfg.masking_mode != "off":
            text = apply_masking(text, mode=cfg.masking_mode)
        sh = simhash64(text)
        # Deduplication: skip files that are near-identical to one already kept.
        if cfg.dedup_bits > 0 and any(hamming(sh, h0) <= cfg.dedup_bits for h0 in sim_seen):
            continue
        sim_seen.append(sh)
        candidates.append({
            "path": f,
            "sha256": digest,
            "summary": summarize(f, text, max_lines=40),
            "text": text,
            "simhash": sh,
        })

    sha_by_path = {rec["path"]: rec["sha256"] for rec in candidates}

    # Apply budget + reflect mode (Explain & Drift)
    est_total = 0
    selected_blocks: list[tuple[Path, str, str]] = []
    selected_hashes: list[int] = []

    def drift_score_bits(sh: int) -> int:
        # Distance to the closest already-selected block; 64 when none yet.
        return min((hamming(sh, prev) for prev in selected_hashes), default=64)

    for rec in candidates:
        if cfg.llm_mode == "off":
            break
        sh = rec["simhash"]
        drift_bits = drift_score_bits(sh)
        drift = round(drift_bits / 64, 3)  # 0~1, higher = fresher
        if cfg.llm_mode == "ref":
            meta = json.dumps({"sha256": rec["sha256"], "path": str(rec["path"]), "drift": drift}, ensure_ascii=False)
            tok = estimate_tokens(meta) + 16
            if est_total + tok > cfg.budget_tokens:
                continue
            est_total += tok
            selected_blocks.append((rec["path"], "json", meta))
            selected_hashes.append(sh)
        elif cfg.llm_mode == "summary":
            payload = rec["summary"]
            tok = estimate_tokens(payload)
            if est_total + tok > cfg.budget_tokens:
                continue
            est_total += tok
            text = payload
            if cfg.explain_capsule:
                text += f"\n\n<!-- why: summary; drift={drift} -->"
            selected_blocks.append((rec["path"], "markdown", text))
            selected_hashes.append(sh)
        else:  # inline
            lines = rec["text"].splitlines()
            if cfg.max_lines and len(lines) > cfg.max_lines:
                lines = lines[: cfg.max_lines]
            content = "\n".join(lines)
            if estimate_tokens(content) > cfg.max_file_tokens:
                head = lines[: max(0, cfg.sample_head)]
                # The tail starts after the head so that short files are not
                # emitted with overlapping (duplicated) lines.
                tail_start = max(len(head), len(lines) - max(0, cfg.sample_tail))
                tail = lines[tail_start:]
                omitted = len(lines) - len(head) - len(tail)
                if omitted > 0:
                    mid = f"\n<!-- [truncated middle: {omitted} lines omitted] -->\n"
                    content = "\n".join(head + [mid] + tail)
            tok = min(cfg.max_file_tokens, estimate_tokens(content))
            if est_total + tok > cfg.budget_tokens:
                continue
            est_total += tok
            if cfg.explain_capsule:
                content += f"\n\n<!-- why: inline; drift={drift}; tok={tok} -->"
            lang = rec["path"].suffix.lstrip(".") or "text"
            selected_blocks.append((rec["path"], lang, content))
            selected_hashes.append(sh)

    # Final reflection of accumulated statistics
    stats.total_files_in_tree = len(files)
    stats.total_omitted = max(0, len(files) - len(selected_blocks))
    stats.total_with_contents = len(selected_blocks)
    stats.est_tokens_prompt = est_total
    # Note: stats.total_dirs accumulated during walk()

    # Manifest
    if cfg.emit_manifest:
        file_manifest = []
        for (p, lang, t) in selected_blocks:
            entry = {
                "path": str(p.relative_to(root)),
                "mode": cfg.llm_mode,
                "sha256": sha_by_path.get(p),
            }
            # Only ref-mode blocks carry JSON metadata. Checking the mode
            # (not the fence language) keeps inlined *.json files from being
            # parsed and merged into the manifest entry.
            if cfg.llm_mode == "ref":
                try:
                    meta = json.loads(t)
                except ValueError:
                    meta = {}
                if isinstance(meta, dict):
                    for key, value in meta.items():
                        # Add extra fields (drift, ...) without clobbering the
                        # relative path and digest recorded above.
                        entry.setdefault(key, value)
            file_manifest.append(entry)

        full_manifest = {
            "stats": {
                "total_dirs": stats.total_dirs,
                "total_files_in_tree": stats.total_files_in_tree,
                "total_omitted": stats.total_omitted,
                "total_with_contents": stats.total_with_contents,
                "est_tokens_prompt": stats.est_tokens_prompt,
            },
            "files": file_manifest
        }
        write_manifest(full_manifest, cfg.output.with_suffix('.manifest.json'))

    return to_markdown(cfg, tree_lines, selected_blocks, stats)
|
src/dir2md/gitignore.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import List, Optional, Callable
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
from pathspec import PathSpec
|
| 7 |
+
except Exception:
|
| 8 |
+
PathSpec = None # type: ignore
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _collect_gitignore_lines(root: Path) -> List[str]:
|
| 12 |
+
lines: List[str] = []
|
| 13 |
+
for gi in root.rglob('.gitignore'):
|
| 14 |
+
rel_dir = gi.parent.relative_to(root)
|
| 15 |
+
base = str(rel_dir).replace('\\', '/')
|
| 16 |
+
raw = gi.read_text(encoding='utf-8', errors='ignore').splitlines()
|
| 17 |
+
for ln in raw:
|
| 18 |
+
s = ln.strip()
|
| 19 |
+
if not s or s.startswith('#'):
|
| 20 |
+
continue
|
| 21 |
+
if s.startswith('/'):
|
| 22 |
+
s = s[1:]
|
| 23 |
+
if base and s:
|
| 24 |
+
s = f"{base}/{s}"
|
| 25 |
+
lines.append(s)
|
| 26 |
+
return lines
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def build_gitignore_matcher(root: Path) -> Optional[Callable[[str], bool]]:
    """Create a predicate that tells whether a root-relative path is ignored.

    Args:
        root: Directory whose ``.gitignore`` files are consulted.

    Returns:
        A callable taking a relative path string, or ``None`` when the
        ``pathspec`` package is unavailable or no patterns were found.
    """
    if PathSpec is None:
        return None

    patterns = _collect_gitignore_lines(root)
    top_level = root / ".gitignore"
    if top_level.exists():
        # Top-level rules go first, ahead of the collected ones.
        top_patterns = top_level.read_text(encoding='utf-8', errors='ignore').splitlines()
        patterns = top_patterns + patterns
    if not patterns:
        return None

    compiled = PathSpec.from_lines("gitwildmatch", patterns)

    def match(relpath: str) -> bool:
        return compiled.match_file(relpath)

    return match
|
src/dir2md/license.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""License and feature gating for dir2md open-core model"""
|
| 2 |
+
import os
|
| 3 |
+
from typing import Set
|
| 4 |
+
|
| 5 |
+
class LicenseManager:
    """Decides which features are usable, based on the DIR2MD_LICENSE variable."""

    def __init__(self):
        # The key is read once, at construction time.
        self.license_key = os.environ.get('DIR2MD_LICENSE', '')
        self.is_pro = self._validate_license()

    def _validate_license(self) -> bool:
        """Return True when the key looks like a Pro key (demo-level check)."""
        # In production, this would validate ed25519 signature
        key = self.license_key
        if not key.startswith('PRO-'):
            return False
        return len(key) > 10

    def get_available_features(self) -> Set[str]:
        """Return a fresh set with the feature names unlocked by this license."""
        features = {
            'basic_masking',
            'directory_scan',
            'gitignore_filter',
            'token_estimation',
            'simhash_dedup',
            'manifest_json',
            'deterministic_output',
            'basic_stats',
        }
        if not self.is_pro:
            return features

        features |= {
            'advanced_masking',
            'language_plugins',
            'parallel_processing',
            'incremental_cache',
            'drift_comparison',
            'html_pdf_export',
            'pr_integration',
            'tui_interface',
        }
        return features

    def check_feature(self, feature: str) -> bool:
        """Tell whether ``feature`` is unlocked."""
        available = self.get_available_features()
        return feature in available

    def require_pro(self, feature: str) -> None:
        """Raise ``LicenseError`` unless ``feature`` is unlocked."""
        if self.check_feature(feature):
            return
        raise LicenseError(
            f"Feature '{feature}' requires dir2md Pro license. "
            f"Visit https://dir2md.com/pro for more information."
        )
|
| 56 |
+
|
| 57 |
+
class LicenseError(Exception):
    """Signals that a Pro-only feature was requested without a Pro license."""
|
| 60 |
+
|
| 61 |
+
# Global license manager instance
# Created at import time, so DIR2MD_LICENSE is read once when this module loads.
license_manager = LicenseManager()
|
src/dir2md/manifest.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import json, hashlib
|
| 4 |
+
|
| 5 |
+
def sha256_bytes(b: bytes) -> str:
    """Return the SHA-256 digest of ``b`` as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(b)
    return hasher.hexdigest()
|
| 7 |
+
|
| 8 |
+
def write_manifest(data: dict, out: Path) -> None:
    """Serialise ``data`` as pretty-printed UTF-8 JSON at ``out``.

    Missing parent directories are created first, so an output path inside
    a not-yet-existing folder does not fail.

    Args:
        data: JSON-serialisable manifest content.
        out: Destination file; overwritten if present.
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII paths readable in the file.
    out.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
src/dir2md/markdown.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import TYPE_CHECKING
|
| 5 |
+
|
| 6 |
+
if TYPE_CHECKING:
|
| 7 |
+
from .core import Config, Stats
|
| 8 |
+
|
| 9 |
+
def to_markdown(cfg: 'Config', tree_lines: list[str], file_blocks: list[tuple[Path, str, str]], stats: 'Stats') -> str:
    """Assemble the final Markdown report.

    The report consists of a header (root, optional timestamp, preset, mode,
    token estimate), the directory tree, one fenced block per selected file
    and, when ``cfg.add_stats`` is set, a summary table.

    Every fenced block uses a fence that is longer than the longest backtick
    run inside its body, so content that itself contains code fences (for
    example Markdown files) cannot terminate the block early.

    Args:
        cfg: Run configuration (``root``, ``no_timestamp``, ``preset``,
            ``llm_mode`` and ``add_stats`` are read).
        tree_lines: Pre-rendered lines of the directory tree.
        file_blocks: ``(path, fence language, body)`` per selected file;
            each path must lie below ``cfg.root``.
        stats: Counters shown in the header and the summary table.

    Returns:
        The Markdown document as a single string.
    """
    def fence_for(body: str) -> str:
        # Longest run of consecutive backticks in the body, plus one
        # (never shorter than the conventional three).
        longest = run = 0
        for ch in body:
            run = run + 1 if ch == "`" else 0
            if run > longest:
                longest = run
        return "`" * max(3, longest + 1)

    parts: list[str] = []
    parts.append("# Project Blueprint\n")
    parts.append(f"- Root: `{cfg.root}` ")
    if not cfg.no_timestamp:
        # Only read the clock when the timestamp is actually emitted.
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        parts.append(f"- Generated: `{ts}` ")
    parts.append(f"- Preset: `{cfg.preset}` ")
    parts.append(f"- LLM mode: `{cfg.llm_mode}` ")
    parts.append(f"- Estimated tokens (prompt): `{stats.est_tokens_prompt}` ")
    parts.append("")
    parts.append("## Directory Tree\n")
    tree_text = "\n".join(tree_lines)
    tree_fence = fence_for(tree_text)
    parts.append(f"{tree_fence}\n{tree_text}\n{tree_fence}\n\n")
    if cfg.llm_mode != "off" and file_blocks:
        parts.append("## File Contents\n")
        for path, lang, text in file_blocks:
            rel = path.relative_to(cfg.root)
            fence = fence_for(text)
            parts.append(f"### File: `{rel}`\n")
            parts.append(f"{fence}{lang}\n{text}\n\n{fence}\n")
    if cfg.add_stats:
        parts.append("## Summary\n")
        parts.append("| metric | value |\n|---|---:|")
        parts.append(f"| dirs | {stats.total_dirs} |")
        parts.append(f"| files in tree | {stats.total_files_in_tree} |")
        parts.append(f"| selected files | {stats.total_with_contents} |")
        parts.append(f"| omitted | {stats.total_omitted} |")
        parts.append(f"| est tokens (prompt) | {stats.est_tokens_prompt} |\n")
    return "\n".join(parts)
|
src/dir2md/parallel.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Parallel processing module (Pro feature)"""
|
| 2 |
+
from .license import license_manager, LicenseError
|
| 3 |
+
|
| 4 |
+
def parallel_file_processing(files, processor_func):
    """Apply ``processor_func`` to each file on a 4-thread pool (Pro feature).

    The license check runs first and raises when the feature is not
    unlocked. Results are returned as a list in input order.
    """
    license_manager.require_pro('parallel_processing')

    # This would contain actual parallel processing logic
    # For demo, just show the restriction
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=4) as pool:
        outcomes = pool.map(processor_func, files)
        return list(outcomes)
|
| 14 |
+
|
| 15 |
+
def check_cache(file_path):
    """Report whether ``file_path`` is cached (Pro feature).

    The license check runs first; the demo implementation then always
    answers ``False``.
    """
    license_manager.require_pro('incremental_cache')

    # Cache checking logic would go here
    is_cached = False  # Simplified for demo
    return is_cached
|
src/dir2md/simhash.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from typing import Iterable
|
| 3 |
+
import re, hashlib
|
| 4 |
+
|
| 5 |
+
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
|
| 6 |
+
|
| 7 |
+
def _tokens(s: str) -> list[str]:
|
| 8 |
+
return _TOKEN_RE.findall(s.lower())
|
| 9 |
+
|
| 10 |
+
def _shingles(seq: list[str], k: int = 4) -> Iterable[int]:
|
| 11 |
+
if k <= 0:
|
| 12 |
+
k = 4
|
| 13 |
+
for i in range(max(0, len(seq)-k+1)):
|
| 14 |
+
payload = " ".join(seq[i:i+k]).encode()
|
| 15 |
+
yield int.from_bytes(hashlib.blake2b(payload, digest_size=8).digest(), 'big')
|
| 16 |
+
|
| 17 |
+
def simhash64(s: str, k: int = 4) -> int:
    """Compute a 64-bit SimHash fingerprint of ``s``.

    Each shingle hash votes +1 or -1 on every bit position; the result has
    a 1 wherever the votes sum to a positive number.
    """
    votes = [0] * 64
    for shingle in _shingles(_tokens(s), k=k):
        for pos in range(64):
            if (shingle >> pos) & 1:
                votes[pos] += 1
            else:
                votes[pos] -= 1
    return sum(1 << pos for pos, tally in enumerate(votes) if tally > 0)
|
| 27 |
+
|
| 28 |
+
def hamming(a: int, b: int) -> int:
    """Count the bit positions in which ``a`` and ``b`` differ."""
    differing = a ^ b
    return bin(differing).count("1")
|
src/dir2md/summary.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import ast
|
| 4 |
+
|
| 5 |
+
def summarize(path: Path, content: str, max_lines: int = 60) -> str:
    """Produce a short bullet-list summary of a file.

    * ``.py``: top-level imports, classes and functions (``def`` and
      ``async def``), each list cut to 200 characters. Sources that cannot
      be parsed fall through to the generic summary.
    * ``.md`` / ``.markdown``: up to ten heading lines, or the generic
      summary when there are none.
    * anything else: the non-blank lines among the first ``max_lines``.

    Args:
        path: File path; only its suffix is inspected.
        content: File text.
        max_lines: Line limit for the generic summary.

    Returns:
        The summary text, one ``- `` bullet per line.
    """
    ext = path.suffix.lower()
    if ext == ".py":
        try:
            tree = ast.parse(content)
        except Exception:
            # Best effort: unparsable Python is summarised like plain text.
            tree = None
        if tree is not None:
            # Coroutines are functions too; include them in the listing.
            funcs = [n.name for n in tree.body
                     if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))]
            clss = [n.name for n in tree.body if isinstance(n, ast.ClassDef)]
            imps = []
            for n in tree.body:
                if isinstance(n, (ast.Import, ast.ImportFrom)):
                    imps.extend(a.name for a in n.names)
            lines = []
            if imps:
                lines.append(f"- imports: {', '.join(imps)[:200]}")
            if clss:
                lines.append(f"- classes: {', '.join(clss)[:200]}")
            if funcs:
                lines.append(f"- functions: {', '.join(funcs)[:200]}")
            return "\n".join(lines) or "- (no symbols)"
    if ext in {".md", ".markdown"}:
        heads = [ln.strip() for ln in content.splitlines() if ln.strip().startswith("#")][:10]
        return "\n".join([f"- {h}" for h in heads]) or _first_lines(content, max_lines)
    return _first_lines(content, max_lines)
|
| 29 |
+
|
| 30 |
+
def _first_lines(content: str, max_lines: int) -> str:
|
| 31 |
+
lines = content.splitlines()[:max_lines]
|
| 32 |
+
return "\n".join([f"- {ln}" for ln in lines if ln.strip()])
|
src/dir2md/token.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
def estimate_tokens(text: str) -> int:
    """Rough token count: one token per four characters, rounded up, at least 1."""
    # Simple estimation: 4 chars ≈ 1 token
    whole, leftover = divmod(len(text), 4)
    estimate = whole + (1 if leftover else 0)
    return estimate if estimate >= 1 else 1
|
tests/test_dir2md.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import json, tempfile
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from dir2md.core import Config, generate_markdown_report
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _make_repo(tmp: Path) -> Path:
|
| 8 |
+
(tmp/"src").mkdir(parents=True, exist_ok=True)
|
| 9 |
+
# Make this file long enough to trigger truncation
|
| 10 |
+
long_content = "\n".join([f" print('line {i}')" for i in range(100)])
|
| 11 |
+
(tmp/"src"/"a.py").write_text(f"""
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
class A: pass
|
| 15 |
+
|
| 16 |
+
def foo():
|
| 17 |
+
{long_content}
|
| 18 |
+
return 42
|
| 19 |
+
""", encoding="utf-8")
|
| 20 |
+
(tmp/"src"/"b.py").write_text("""
|
| 21 |
+
import sys
|
| 22 |
+
|
| 23 |
+
def bar():
|
| 24 |
+
return 43
|
| 25 |
+
""", encoding="utf-8")
|
| 26 |
+
# Similar file (for deduplication testing)
|
| 27 |
+
(tmp/"src"/"b_copy.py").write_text((tmp/"src"/"b.py").read_text(encoding="utf-8"), encoding="utf-8")
|
| 28 |
+
(tmp/"README.md").write_text("# Title\n\nSome text\n", encoding="utf-8")
|
| 29 |
+
return tmp
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_budget_and_modes(tmp_path: Path):
    """Summary mode: manifest is written, distinct files are kept, exact duplicates are dropped."""
    root = _make_repo(tmp_path)
    cfg = Config(
        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
        llm_mode="summary", budget_tokens=200, max_file_tokens=1200, dedup_bits=16,
        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=True,
        preset="pro", explain_capsule=True,
    )
    md = generate_markdown_report(cfg)
    assert "Estimated tokens (prompt):" in md
    mpath = (root/"OUT.manifest.json")
    assert mpath.exists()
    man = json.loads(mpath.read_text(encoding="utf-8"))
    paths = {entry["path"] for entry in man["files"]}
    assert any(p.endswith("a.py") for p in paths)
    assert any(p.endswith("b.py") for p in paths)
    # b_copy.py is byte-identical to b.py and is visited after it, so its
    # simhash distance is 0 and deduplication must drop it.
    assert not any(p.endswith("b_copy.py") for p in paths)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_ref_mode_manifest(tmp_path: Path):
    """Ref mode: the manifest has 'stats' and 'files', and every file entry carries a sha256."""
    root = _make_repo(tmp_path)
    settings = dict(
        root=root,
        output=root/"OUT.md",
        include_globs=[],
        exclude_globs=[],
        omit_globs=[],
        respect_gitignore=False,
        follow_symlinks=False,
        max_bytes=200_000,
        max_lines=2000,
        include_contents=True,
        only_ext=None,
        add_stats=True,
        add_toc=False,
        llm_mode="ref",
        budget_tokens=120,
        max_file_tokens=1200,
        dedup_bits=16,
        sample_head=120,
        sample_tail=40,
        strip_comments=False,
        emit_manifest=True,
        preset="pro",
        explain_capsule=False,
    )
    # Only the manifest side effect is inspected; the Markdown is not needed.
    generate_markdown_report(Config(**settings))

    manifest_text = (root/"OUT.manifest.json").read_text(encoding="utf-8")
    man = json.loads(manifest_text)
    assert "stats" in man
    assert "files" in man
    for entry in man["files"]:
        assert "sha256" in entry
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_inline_sampling(tmp_path: Path):
    """Run inline mode with tight limits and check the sampling markers.

    Asserts that the report contains both ``truncated middle`` and
    ``why: inline``.
    """
    repo = _make_repo(tmp_path)

    # Drastically reduced budget to trigger sampling
    config = Config(
        root=repo,
        output=repo / "OUT.md",
        include_globs=[],
        exclude_globs=[],
        omit_globs=[],
        respect_gitignore=False,
        follow_symlinks=False,
        max_bytes=200_000,
        max_lines=50,
        include_contents=True,
        only_ext=None,
        add_stats=True,
        add_toc=False,
        llm_mode="inline",
        budget_tokens=50,
        max_file_tokens=30,
        dedup_bits=0,
        sample_head=5,
        sample_tail=3,
        strip_comments=False,
        emit_manifest=False,
        preset="pro",
        explain_capsule=True,
    )

    report = generate_markdown_report(config)
    for marker in ("truncated middle", "why: inline"):
        assert marker in report
|
| 84 |
+
|
| 85 |
+
def test_masking(tmp_path: Path):
    """Check that ``masking_mode`` controls whether a secret reaches the report.

    With ``masking_mode="basic"`` the secret line must be absent and the mask
    placeholder present; after switching the same config to ``"off"`` the
    secret must appear verbatim and the placeholder must not.
    """
    placeholder = "[*** MASKED_SECRET ***]"
    repo = _make_repo(tmp_path)

    # Add a file with a secret
    secret_content = "My AWS key is AKIAIOSFODNN7EXAMPLE"
    env_file = repo / ".env"
    env_file.write_text(secret_content, encoding="utf-8")

    cfg = Config(
        root=repo,
        output=repo / "OUT.md",
        include_globs=[],
        exclude_globs=[],
        omit_globs=[],
        respect_gitignore=False,
        follow_symlinks=False,
        max_bytes=200_000,
        max_lines=2000,
        include_contents=True,
        only_ext=None,
        add_stats=True,
        add_toc=False,
        llm_mode="inline",
        budget_tokens=1000,
        max_file_tokens=1000,
        dedup_bits=0,
        sample_head=120,
        sample_tail=40,
        strip_comments=False,
        emit_manifest=False,
        preset="pro",
        explain_capsule=False,
        no_timestamp=True,
        masking_mode="basic",
    )

    masked_report = generate_markdown_report(cfg)
    assert secret_content not in masked_report
    assert placeholder in masked_report

    # Test with masking off: the same config object is reused with only the
    # masking mode changed.
    cfg.masking_mode = "off"
    plain_report = generate_markdown_report(cfg)
    assert secret_content in plain_report
    assert placeholder not in plain_report
|