Flamehaven committed on
Commit
d466b7d
·
0 Parent(s):

Initial commit: Dir2md open-core project

Browse files
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
{
  "name": "dir2md",
  "image": "mcr.microsoft.com/devcontainers/python:3.11",
  "postCreateCommand": "pip install -e . pre-commit && pre-commit install",
  "customizations": {
    "vscode": {
      "extensions": ["ms-python.python", "ms-python.vscode-pylance"]
    }
  }
}
.github/workflows/dir2md-blueprint.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Builds a dir2md blueprint of the repository on every pull request (or on
# manual dispatch), uploads it as an artifact and posts a summary comment.
name: dir2md Blueprint
on:
  pull_request:
    types: [opened, synchronize, reopened]
  workflow_dispatch:

# Least privilege: checkout needs read access, the PR comment needs write
# access to pull requests. Nothing else is granted.
permissions:
  contents: read
  pull-requests: write

jobs:
  build-blueprint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dir2md
        run: |
          python -m pip install --upgrade pip
          pip install .
      - name: Generate blueprint
        id: gen
        run: |
          # Generation stays best-effort (a failure must not block the PR),
          # but it is reported instead of being silently swallowed.
          dir2md . --capsule --emit-manifest --stats -o PROJECT_BLUEPRINT.md \
            || echo "::warning::dir2md exited with a non-zero status"
          # The manifest is optional (some presets disable it), so fall back
          # to "n/a" rather than letting jq fail on a missing file.
          if [ -f PROJECT_BLUEPRINT.manifest.json ]; then
            TOKENS=$(jq -r '.stats.est_tokens_prompt // "n/a"' PROJECT_BLUEPRINT.manifest.json)
          else
            TOKENS="n/a"
          fi
          echo "tokens=$TOKENS" >> "$GITHUB_OUTPUT"
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: dir2md-blueprint
          path: |
            PROJECT_BLUEPRINT.md
            PROJECT_BLUEPRINT.manifest.json
            PROJECT_BLUEPRINT.capsule.zip
      - name: Comment PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        env:
          # Passed through the environment so the value is never spliced
          # into the script source.
          TOKENS: ${{ steps.gen.outputs.tokens }}
        with:
          script: |
            const tokens = process.env.TOKENS;
            const body = [
              '## 📦 dir2md Blueprint',
              `- Estimated prompt tokens: **${tokens}**`,
              '- Artifacts: _see workflow run → Artifacts_',
              '',
              'Run locally:',
              '```bash',
              'pip install .',
              'dir2md .',
              '```'
            ].join('\n');
            // Await the request so API errors fail the step instead of
            // being lost as an unhandled promise.
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body
            });
.gitignore ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .env
6
+ .venv/
7
+ venv/
8
+ .idea/
9
+ .vscode/
10
+ .ipynb_checkpoints/
11
+ .dist/
12
+ .build/
13
+ .coverage
14
+ .pytest_cache/
15
+
16
+ # OS
17
+ .DS_Store
18
+ Thumbs.db
19
+
20
# Output files - exclude test outputs but keep documentation
*.manifest.json
*_blueprint.md
*_summary.md
*_output.md
test_output.md
example_*.md
pro_*.md
raw_*.md
secure_*.md
# Anchored to the repository root on purpose: an unanchored `masking.py`
# would also ignore src/dir2md/masking.py, which the package imports.
# NOTE(review): assumes the ignored file is a root-level scratch script - confirm.
/masking.py
31
+
32
+ # Keep important documentation
33
+ !README.md
34
+ !FEATURES.md
35
+ !CURRENT_FEATURES.md
36
+ !CONTRIBUTING.md
37
+ !CHANGELOG.md
38
+
39
+ # Virtual environment (large, not needed)
40
+ venv_clean/
41
+
42
+ # Additional ignores
43
+ .dir2md_cache/
44
+ tmp/
45
+ temp/
46
+
47
+ # Personal files to ignore
48
+ ENGLISH_CONVERSION_COMPLETE.md
49
+ USAGE_EXAMPLES.md
.pre-commit-config.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ - id: dir2md-dry-run
5
+ name: dir2md (dry-run)
6
+ entry: bash -lc 'dir2md . --preset iceberg --emit-manifest --stats --dry-run >/dev/null || true'
7
+ language: system
8
+ pass_filenames: false
9
+ stages: [pre-commit]
Dockerfile ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+ WORKDIR /work
3
+ COPY . /work
4
+ RUN pip install --no-cache-dir .
5
+ ENTRYPOINT ["dir2md"]
6
+ CMD ["."]
FEATURES.md ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dir2md Feature Comparison: Open Source vs Pro
2
+
3
+ > **Transform your codebase into LLM-optimized markdown blueprints**
4
+
5
+ Dir2md follows an **Open-Core** model - providing essential functionality for free while offering advanced features for professional teams and power users.
6
+
7
+ ## 🎯 Quick Comparison
8
+
9
+ | Feature Category | Open Source (Free) | Pro Version |
10
+ |------------------|-------------------|-------------|
11
+ | **Basic Functionality** | ✅ Full Access | ✅ Enhanced |
12
+ | **Security & Masking** | ✅ Basic Patterns | ✅ Advanced + Custom |
13
+ | **Performance** | ✅ Single-threaded | ✅ Parallel + Caching |
14
+ | **Export Options** | ✅ Markdown Only | ✅ HTML, PDF, Slides |
15
+ | **Team Features** | ❌ Individual Use | ✅ CI/CD Integration |
16
+ | **Language Support** | ✅ Basic Analysis | ✅ Smart Plugins |
17
+
18
+ ---
19
+
20
+ ## 🔓 Open Source Features (MIT License)
21
+
22
+ ### Core Functionality
23
+ - **📁 Directory Scanning**: Complete file tree analysis with `.gitignore` support
24
+ - **🎯 Smart Filtering**: Include/exclude/omit glob patterns
25
+ - **📊 Token Optimization**: Head/tail sampling with configurable budgets
26
+ - **🔄 Duplicate Detection**: SimHash-based content deduplication
27
+ - **📋 Manifest Generation**: JSON metadata with file hashes and statistics
28
+ - **⏰ Deterministic Output**: `--no-timestamp` for reproducible builds
29
+ - **🎨 Multiple Presets**: `iceberg`, `pro`, `raw` (default: `raw` for developers)
30
+
31
+ ### Basic Security
32
+ - **🛡️ Essential Masking**: Protection for common secrets
33
+ - AWS Access Keys (`AKIA[0-9A-Z]{16}`)
34
+ - Bearer Tokens (`Bearer <token>`)
35
+ - Private Keys (`-----BEGIN ... PRIVATE KEY-----`)
36
+
37
+ ### Output Modes
38
+ - **📝 Reference Mode**: File listings with metadata
39
+ - **📖 Summary Mode**: Condensed content overview
40
+ - **📄 Inline Mode**: Full content inclusion (within token budget)
41
+
42
+ ### CLI & Integration
43
+ - **⚡ Command Line Interface**: Full-featured CLI with help system
44
+ - **🔧 Configurable Options**: Extensive customization via arguments
45
+ - **📦 Easy Installation**: `pip install dir2md`
46
+
47
+ ---
48
+
49
+ ## 🔒 Pro Version Features
50
+
51
+ ### Advanced Security & Compliance
52
+ - **🛡️ Comprehensive Masking**: 25+ built-in patterns
53
+ - Cloud Provider Keys (AWS, Azure, GCP)
54
+ - API Tokens (Slack, GitHub, GitLab)
55
+ - Database Connections & Credentials
56
+ - Custom Pattern Support
57
+ - **🔍 Smart Detection**: File-type aware masking
58
+ - **✅ False Positive Reduction**: Context-aware pattern matching
59
+ - **📝 Audit Logging**: Security scanning reports
60
+
61
+ ### Performance & Scale
62
+ - **⚡ Parallel Processing**: Multi-threaded file analysis
63
+ - **💾 Incremental Caching**: `.dir2md_cache/` for faster re-runs
64
+ - **📈 Large Repository Support**: Optimized for 10,000+ files
65
+ - **🚀 Streaming Processing**: Memory-efficient for massive codebases
66
+
67
+ ### Advanced Analysis
68
+ - **🧠 Language Plugins**: Smart code analysis
69
+ - **Python**: AST parsing, function/class extraction
70
+ - **JavaScript/TypeScript**: ES module analysis, export detection
71
+ - **Go**: Package structure, type definitions
72
+ - **Java**: Class hierarchy, annotation extraction
73
+ - **📊 Drift Detection**: Compare blueprint versions
74
+ - **🎯 Impact Scoring**: Identify critical changes
75
+
76
+ ### Export & Sharing
77
+ - **📄 Multiple Formats**: HTML, PDF, PowerPoint slides
78
+ - **🎨 Custom Templates**: Branded output with Jinja2
79
+ - **📱 Responsive HTML**: Mobile-friendly documentation
80
+ - **🖨️ Print Optimization**: Publication-ready PDFs
81
+
82
+ ### Team & CI/CD Integration
83
+ - **🤖 GitHub Actions**: Automated blueprint generation
84
+ - **💬 PR Comments**: Automatic documentation updates
85
+ - **🔗 GitLab Integration**: Pipeline integration support
86
+ - **📋 Status Checks**: Quality gates for documentation
87
+ - **👥 Team Templates**: Standardized output formats
88
+
89
+ ### Developer Experience
90
+ - **🖥️ Terminal UI (TUI)**: Interactive file selection
91
+ - **🔍 Live Preview**: Real-time output preview
92
+ - **⚙️ Advanced Configuration**: Team-wide settings
93
+ - **📊 Analytics Dashboard**: Usage metrics and insights
94
+
95
+ ---
96
+
97
+ ## 💰 Pricing & Licensing
98
+
99
+ ### Open Source (MIT)
100
+ - **Price**: Free forever
101
+ - **Use Case**: Individual developers, small projects
102
+ - **Support**: Community via GitHub Issues
103
+ - **License**: MIT - commercial use allowed
104
+
105
+ ### Pro Version
106
+ - **Individual**: $29/month or $290/year
107
+ - **Team (5 users)**: $99/month or $990/year
108
+ - **Enterprise**: Custom pricing with on-premise options
109
+ - **Support**: Priority email support + documentation
110
+ - **License**: Commercial license with usage analytics opt-out
111
+
112
+ ---
113
+
114
+ ## 🚀 Usage Examples
115
+
116
+ ### Open Source Quick Start
117
+
118
+ ```bash
119
+ # Install from PyPI
120
+ pip install dir2md
121
+
122
+ # Basic usage with security masking
123
+ dir2md ./my-project --masking basic --preset raw
124
+
125
+ # Generate with manifest for CI/CD
126
+ dir2md . --emit-manifest --no-timestamp --output blueprint.md
127
+ ```
128
+
129
+ ### Pro Version Examples
130
+
131
+ ```bash
132
+ # Set Pro license
133
+ export DIR2MD_LICENSE="PRO-your-license-key"
134
+
135
+ # Advanced masking with custom patterns
136
+ dir2md . --masking advanced --preset pro
137
+
138
+ # Parallel processing with caching
139
+ dir2md ./large-repo --parallel --use-cache
140
+
141
+ # Generate multiple formats
142
+ dir2md . --export html,pdf --template branded
143
+ ```
144
+
145
+ ### GitHub Actions Integration
146
+
147
+ **Open Source:**
148
+ ```yaml
149
+ - name: Generate Blueprint
150
+ run: |
151
+ pip install dir2md
152
+ dir2md . --no-timestamp --output docs/blueprint.md
153
+ ```
154
+
155
+ **Pro Version:**
156
+ ```yaml
157
+ - name: Generate Pro Blueprint
158
+ env:
159
+ DIR2MD_LICENSE: ${{ secrets.DIR2MD_PRO_LICENSE }}
160
+ run: |
161
+ pip install dir2md-pro
162
+ dir2md . --masking advanced --export html --pr-comment
163
+ ```
164
+
165
+ ---
166
+
167
+ ## 🎯 When to Upgrade to Pro
168
+
169
+ ### Individual Developers
170
+ - Working with sensitive codebases requiring advanced security
171
+ - Need faster processing for large repositories (1000+ files)
172
+ - Want professional-looking exports for client presentations
173
+ - Require language-specific code analysis
174
+
175
+ ### Teams & Organizations
176
+ - Standardizing documentation across multiple projects
177
+ - Integrating with CI/CD pipelines for automatic updates
178
+ - Need compliance features for security auditing
179
+ - Want team analytics and usage insights
180
+
181
+ ### Enterprise Users
182
+ - On-premise deployment requirements
183
+ - SSO/SAML integration needs
184
+ - Custom security patterns and compliance rules
185
+ - Dedicated support and SLA requirements
186
+
187
+ ---
188
+
189
+ ## 🛠️ Technical Implementation
190
+
191
+ ### Open-Core Architecture
192
+ ```
193
+ dir2md-core (OSS) dir2md-pro (Commercial)
194
+ ├── CLI Interface ├── Advanced Masking
195
+ ├── File Scanning ├── Language Plugins
196
+ ├── Token Optimization ├── Parallel Engine
197
+ ├── Basic Masking ├── Export Templates
198
+ ├── Manifest Generation ├── Team Integration
199
+ └── Markdown Output └── License Validation
200
+ ```
201
+
202
+ ### License Validation
203
+ - **Runtime Check**: Environment variable `DIR2MD_LICENSE`
204
+ - **Offline Validation**: Ed25519 signature verification
205
+ - **Graceful Degradation**: Falls back to OSS features if invalid
206
+ - **No Phone Home**: All validation happens locally
207
+
208
+ ### Plugin System
209
+ ```python
210
+ # Pro Plugin Example
211
+ class PythonAnalyzer(LanguagePlugin):
212
+ extensions = {'.py'}
213
+
214
+ def analyze(self, content: str) -> Dict[str, Any]:
215
+ return {
216
+ 'functions': self.extract_functions(content),
217
+ 'classes': self.extract_classes(content),
218
+ 'imports': self.extract_imports(content)
219
+ }
220
+ ```
221
+
222
+ ---
223
+
224
+ ## 🆚 Comparison with Alternatives
225
+
226
+ | Tool | Open Source | Pro Features | License Model |
227
+ |------|-------------|--------------|---------------|
228
+ | **dir2md** | ✅ Full core functionality | ✅ Advanced security, performance, team features | Open-Core (MIT + Commercial) |
229
+ | tree + cat | ✅ Basic listing | ❌ No advanced features | Free (but manual) |
230
+ | Proprietary doc tools | ❌ Closed source | ✅ Enterprise features | Subscription only |
231
+ | Custom scripts | ✅ DIY solution | ❌ No standardization | Time investment |
232
+
233
+ ---
234
+
235
+ ## 📞 Get Started
236
+
237
+ ### Try Open Source
238
+ ```bash
239
+ pip install dir2md
240
+ dir2md --help
241
+ ```
242
+
243
+ ### Evaluate Pro Features
244
+ ```bash
245
+ # 14-day free trial
246
+ export DIR2MD_LICENSE="TRIAL-request-at-dir2md.com"
247
+ pip install dir2md-pro
248
+ dir2md --masking advanced --parallel
249
+ ```
250
+
251
+ ### Purchase Pro License
252
+ - **Individual**: [Buy now for $29/month](https://dir2md.com/buy/individual)
253
+ - **Team**: [Start team trial](https://dir2md.com/buy/team)
254
+ - **Enterprise**: [Contact sales](https://dir2md.com/contact)
255
+
256
+ ---
257
+
258
+ ## 🤝 Contributing
259
+
260
+ Dir2md's open-source core welcomes contributions:
261
+
262
+ - **Bug Reports**: [GitHub Issues](https://github.com/your-org/dir2md/issues)
263
+ - **Feature Requests**: [GitHub Discussions](https://github.com/your-org/dir2md/discussions)
264
+ - **Code Contributions**: See [CONTRIBUTING.md](CONTRIBUTING.md)
265
+ - **Documentation**: Help improve our guides and examples
266
+
267
+ Pro features are developed in-house but benefit from community feedback and OSS improvements.
268
+
269
+ ---
270
+
271
+ *Made with ❤️ for developers who value great documentation*
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Yoon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dir2md
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
5
+
6
+ > Transform your codebase into LLM-optimized markdown blueprints
7
+
8
+ Dir2md analyzes directory structures and generates comprehensive markdown documentation optimized for Large Language Models. It intelligently samples content, removes duplicates, and provides token-budget control to create the perfect context for AI-assisted development.
9
+
10
+ ## ✨ Key Features
11
+
12
+ - **🎯 Smart Content Sampling**: Head/tail sampling with configurable token budgets
13
+ - **🔄 Duplicate Detection**: SimHash-based deduplication to reduce noise
14
+ - **🛡️ Security First**: Built-in secret masking (basic OSS, advanced Pro)
15
+ - **📊 Multiple Output Modes**: Reference, summary, or full inline content
16
+ - **🔧 Highly Configurable**: Extensive filtering and customization options
17
+ - **⚡ Developer Friendly**: Raw mode default for complete code visibility
18
+
19
+ ## 🚀 Quick Start
20
+
21
+ ### Installation
22
+
23
+ ```bash
24
+ # From source (current)
25
+ git clone https://github.com/your-org/dir2md.git
26
+ cd dir2md
27
+ pip install -e .   # installs dependencies and the `dir2md` command
+ python -m src.dir2md.cli --help
28
+
29
+ # Coming soon: PyPI installation
30
+ pip install dir2md
31
+ ```
32
+
33
+ ### Basic Usage
34
+
35
+ ```bash
36
+ # Generate project blueprint (developer-friendly raw mode)
37
+ dir2md .
38
+
39
+ # With basic security masking
40
+ dir2md . --masking basic
41
+
42
+ # Generate with manifest for CI/CD
43
+ dir2md . --emit-manifest --no-timestamp
44
+
45
+ # Token-optimized for LLM context
46
+ dir2md . --budget-tokens 4000 --preset iceberg
47
+ ```
48
+
49
+ ### Output Example
50
+
51
+ ```markdown
52
+ # Project Blueprint
53
+
54
+ - Root: `/path/to/project`
55
+ - Generated: `2025-09-08 12:30:15`
56
+ - Preset: `raw`
57
+ - LLM mode: `inline`
58
+ - Estimated tokens (prompt): `6247`
59
+
60
+ ## Directory Tree
61
+ [Complete file structure]
62
+
63
+ ## Statistics
64
+ | Metric | Value |
65
+ |--------|-------|
66
+ | Total files | 42 |
67
+ | Estimated tokens | 6247 |
68
+
69
+ ## File Contents
70
+ [Intelligently sampled content...]
71
+ ```
72
+
73
+ ## 📋 Available Presets
74
+
75
+ | Preset | Description | Best For |
76
+ |--------|-------------|-----------|
77
+ | `raw` | Full content inclusion | Development, code review |
78
+ | `iceberg` | Balanced sampling | General documentation |
79
+ | `pro` | Advanced optimization | Large projects, LLM context |
80
+
81
+ ## 🔒 Open-Core Model
82
+
83
+ ### Free (OSS) Features
84
+ - Complete directory analysis
85
+ - Token optimization and sampling
86
+ - SimHash deduplication
87
+ - Basic security masking (3 patterns)
88
+ - All output modes and presets
89
+ - Deterministic builds
90
+
91
+ ### Pro Features
92
+ - Advanced security masking (9+ patterns)
93
+ - Parallel processing & caching
94
+ - Language-specific analysis plugins
95
+ - HTML/PDF export options
96
+ - Team integration (CI/CD, PR bots)
97
+ - Priority support
98
+
99
+ [Learn more about Pro features](FEATURES.md)
100
+
101
+ ## 📖 Documentation
102
+
103
+ - **[Feature Comparison](FEATURES.md)** - Complete OSS vs Pro breakdown
104
+ - **[Current Status](CURRENT_FEATURES.md)** - What's implemented now
105
+ - **[Usage Examples](USAGE_EXAMPLES.md)** - Hands-on guide with examples
106
+
107
+ ## 🛠️ CLI Reference
108
+
109
+ ```bash
110
+ # Basic options
111
+ dir2md [path] -o output.md --preset [iceberg|pro|raw]
112
+
113
+ # Token control
114
+ --budget-tokens 6000 # Total token budget
115
+ --max-file-tokens 1200 # Per-file token limit
116
+ --sample-head 120 # Lines from file start
117
+ --sample-tail 40 # Lines from file end
118
+
119
+ # Filtering
120
+ --include-glob "*.py,*.md" # Include patterns
121
+ --exclude-glob "test*,*.tmp" # Exclude patterns
122
+ --only-ext "py,js,ts" # File extensions only
123
+
124
+ # Security
125
+ --masking [off|basic|advanced] # Secret masking level
126
+
127
+ # Output
128
+ --emit-manifest # Generate JSON metadata
129
+ --no-timestamp # Reproducible output
130
+ --dry-run # Preview without writing
131
+ ```
132
+
133
+ ## 🤝 Contributing
134
+
135
+ We welcome contributions! Dir2md follows an open-core model:
136
+
137
+ - **Core functionality**: Open source (this repo)
138
+ - **Advanced features**: Commercial (separate repo)
139
+ - **Community**: All discussions welcome
140
+
141
+ ### Development Setup
142
+
143
+ ```bash
144
+ git clone https://github.com/your-org/dir2md.git
145
+ cd dir2md
146
+ python -m pytest -v # Run tests
147
+ python -m src.dir2md.cli . --dry-run # Test CLI
148
+ ```
149
+
150
+ ### Reporting Issues
151
+
152
+ - 🐛 **Bug reports**: [GitHub Issues](https://github.com/your-org/dir2md/issues)
153
+ - 💡 **Feature requests**: [GitHub Discussions](https://github.com/your-org/dir2md/discussions)
154
+ - 📧 **Security issues**: [email protected]
155
+
156
+ ## 📄 License
157
+
158
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
159
+
160
+ Pro features are available under a separate commercial license.
161
+
162
+ ## 🌟 Why Dir2md?
163
+
164
+ Traditional documentation approaches fall short when working with AI assistants:
165
+
166
+ - **Too much noise**: Raw `tree` + `cat` includes irrelevant files
167
+ - **Token waste**: Unoptimized content hits LLM context limits
168
+ - **Security risks**: Accidental exposure of secrets and keys
169
+ - **No structure**: Difficult for AI to understand project layout
170
+
171
+ Dir2md solves these problems with intelligent analysis, sampling, and optimization specifically designed for the AI era.
172
+
173
+ ---
174
+
175
+ *Made with ❤️ for developers who want their AI to understand their code*
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dir2md"
7
+ version = "0.0.1"
8
+ description = "Generate a Markdown blueprint: directory tree + optional file contents (token-optimized, ICEBERG preset)"
9
+ readme = "README.md"
10
+ authors = [{name = "Flamehaven", email = "[email protected]"}]
11
+ license = {text = "MIT"}
12
+ requires-python = ">=3.9"
13
+ dependencies = ["pathspec>=0.12.0"]
14
+
15
+ [project.scripts]
16
+ dir2md = "dir2md.cli:main"
17
+
18
+ [tool.setuptools]
19
+ package-dir = {"" = "src"}
20
+
21
+ [tool.setuptools.packages.find]
22
+ where = ["src"]
scripts/bench_dir2md.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import time, json, argparse
3
+ from pathlib import Path
4
+ from dir2md.core import Config, generate_markdown_report
5
+
6
def run_case(root: Path, preset: str, mode: str | None, budget: int, file_budget: int) -> dict:
    """Time one report generation and collect its headline numbers.

    Args:
        root: Directory to scan.
        preset: Preset name handed to ``Config``.
        mode: LLM mode; ``None`` falls back to ``"ref"``.
        budget: Value used for ``budget_tokens``.
        file_budget: Value used for ``max_file_tokens``.

    Returns:
        A dict with the keys ``preset``, ``mode``, ``elapsed_sec`` and
        ``est_tokens``.

    Raises:
        ValueError: if the text after the token marker is not an integer.
    """
    config = Config(
        root=root,
        output=root / "_BENCH.md",
        include_globs=[],
        exclude_globs=[],
        omit_globs=[],
        respect_gitignore=True,
        follow_symlinks=False,
        max_bytes=200_000,
        max_lines=2000,
        include_contents=True,
        only_ext=None,
        add_stats=True,
        add_toc=False,
        llm_mode=(mode or "ref"),
        budget_tokens=budget,
        max_file_tokens=file_budget,
        dedup_bits=16,
        sample_head=120,
        sample_tail=40,
        strip_comments=False,
        emit_manifest=False,
        preset=preset,
        explain_capsule=True,
    )

    started = time.perf_counter()
    report = generate_markdown_report(config)
    elapsed = time.perf_counter() - started

    # The estimate is scraped from the rendered report: take whatever sits
    # between the marker and the next backtick.
    after_marker = report.split("Estimated tokens (prompt): `")[-1]
    token_text = after_marker.split("`")[0]

    # llm_mode is read after generation, so it reflects the value left on the
    # config object at that point.
    return {
        "preset": preset,
        "mode": config.llm_mode,
        "elapsed_sec": round(elapsed, 3),
        "est_tokens": int(token_text),
    }
20
+
21
+
22
def main():
    """Run the fixed benchmark matrix against a directory and print JSON results."""
    parser = argparse.ArgumentParser()
    parser.add_argument("path", nargs="?", default=".")
    args = parser.parse_args()
    target = Path(args.path).resolve()

    # Each entry: (preset, llm mode, total token budget, per-file token budget)
    matrix = (
        ("iceberg", None, 6000, 1000),
        ("pro", "summary", 6000, 1000),
        ("pro", "ref", 4000, 1000),
        ("pro", "inline", 8000, 1200),
    )

    results = []
    for preset, mode, budget, file_budget in matrix:
        results.append(run_case(target, preset, mode, budget, file_budget))
    print(json.dumps(results, indent=2))
35
+
36
+ if __name__ == "__main__":
37
+ main()
src/dir2md/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __all__ = ["__version__"]
2
+ __version__ = "0.0.1"
src/dir2md/cli.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import argparse, zipfile, hashlib
3
+ from pathlib import Path
4
+ from .core import Config, generate_markdown_report
5
+ from . import __version__
6
+
7
+ DEFAULT_EXCLUDES = [
8
+ ".git", "__pycache__", "node_modules", ".venv",
9
+ "build", "dist", "*.pyc", ".DS_Store",
10
+ ]
11
+
12
def positive_int(v: str) -> int:
    """argparse ``type=`` converter that accepts only integers greater than zero.

    Args:
        v: Raw command-line token.

    Returns:
        The parsed integer.

    Raises:
        argparse.ArgumentTypeError: if ``v`` is not an integer or is <= 0.
    """
    try:
        parsed = int(v)
    except ValueError:
        raise argparse.ArgumentTypeError("Please enter an integer value.")
    if parsed > 0:
        return parsed
    raise argparse.ArgumentTypeError("Only positive integers are allowed.")
20
+
21
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse arguments, build a ``Config``, emit the report.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` after a dry run or after the Markdown file has been written.

    Raises:
        SystemExit: raised by argparse for invalid arguments, ``--help`` and
            ``--version``.
        Any exception raised by ``generate_markdown_report`` propagates unchanged.
    """
    ap = argparse.ArgumentParser(prog="dir2md", description="Directory → Markdown exporter with LLM optimization")
    ap.add_argument("path", nargs="?", default=".")
    ap.add_argument("-o", "--output", default="PROJECT_BLUEPRINT.md")

    # Preset options
    ap.add_argument("--preset", default="raw", choices=["iceberg","pro","raw"], help="Preset mode: iceberg/pro/raw")

    # Token and selection control
    ap.add_argument("--llm-mode", choices=["off","ref","summary","inline"], default=None)
    ap.add_argument("--budget-tokens", type=int, default=6000)
    ap.add_argument("--max-file-tokens", type=int, default=1200)
    ap.add_argument("--dedup", type=int, default=16)
    ap.add_argument("--sample-head", type=int, default=120)
    ap.add_argument("--sample-tail", type=int, default=40)
    ap.add_argument("--explain", action="store_true", help="Include selection rationale and drift_score in capsule comments")

    # Filtering and safety controls
    ap.add_argument("--include-glob", action="append", default=[])
    ap.add_argument("--exclude-glob", action="append", default=[])
    ap.add_argument("--omit-glob", action="append", default=[])
    ap.add_argument("--only-ext", default="")
    ap.add_argument("--respect-gitignore", action="store_true")
    ap.add_argument("--follow-symlinks", action="store_true")
    ap.add_argument("--max-bytes", type=positive_int, default=200_000)
    ap.add_argument("--max-lines", type=positive_int, default=2000)

    # Output options
    ap.add_argument("--emit-manifest", action="store_true")
    ap.add_argument("--stats", action="store_true")
    ap.add_argument("--capsule", action="store_true", help="Package md+manifest into zip")
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--no-timestamp", action="store_true", help="Omit timestamp for reproducible output")
    ap.add_argument("--masking", choices=["off", "basic", "advanced"], default="off", help="Secret masking mode (advanced requires Pro license)")

    ap.add_argument("-V", "--version", action="version", version=f"dir2md {__version__}")

    ns = ap.parse_args(argv)

    root = Path(ns.path).resolve()
    output = Path(ns.output)
    # "py, .md" -> {"py", "md"}; an empty selection means "no extension filter".
    only_ext = {e.strip().lstrip('.') for e in ns.only_ext.split(',') if e.strip()} or None

    cfg = Config(
        root=root,
        output=output,
        include_globs=list(ns.include_glob),
        # Any user-supplied --exclude-glob replaces DEFAULT_EXCLUDES entirely
        # rather than extending it.
        exclude_globs=list(ns.exclude_glob or DEFAULT_EXCLUDES),
        omit_globs=list(ns.omit_glob),
        respect_gitignore=bool(ns.respect_gitignore),
        follow_symlinks=bool(ns.follow_symlinks),
        max_bytes=int(ns.max_bytes) if ns.max_bytes else None,
        max_lines=int(ns.max_lines) if ns.max_lines else None,
        include_contents=True,
        only_ext=only_ext,
        # NOTE(review): `or True` makes this always True, so --stats is
        # currently a no-op. Kept as-is to preserve the default output.
        add_stats=bool(ns.stats or True),
        add_toc=False,
        llm_mode=(ns.llm_mode or "ref"),
        budget_tokens=int(ns.budget_tokens),
        max_file_tokens=int(ns.max_file_tokens),
        dedup_bits=int(ns.dedup),
        sample_head=int(ns.sample_head),
        sample_tail=int(ns.sample_tail),
        strip_comments=False,
        emit_manifest=bool(ns.emit_manifest),
        preset=str(ns.preset),
        explain_capsule=bool(ns.explain),
        no_timestamp=bool(ns.no_timestamp),
        masking_mode=str(ns.masking),
    )

    md = generate_markdown_report(cfg)

    if ns.dry_run:
        # Short content fingerprint so two dry runs can be compared by eye.
        h = hashlib.sha256(md.encode('utf-8')).hexdigest()[:10]
        # NOTE(review): the "est_tokens" figure printed here is the configured
        # budget, not a measured estimate.
        print(f"[dry-run] preset={cfg.preset} mode={cfg.llm_mode} est_tokens~{cfg.budget_tokens} md={h}")
        return 0

    output.write_text(md, encoding="utf-8")
    if ns.capsule:
        manifest = output.with_suffix('.manifest.json')
        with zipfile.ZipFile(output.with_suffix('.capsule.zip'), 'w') as z:
            # Store members under their bare file names; without arcname the
            # archive would embed whatever directory path was given to -o.
            z.write(output, arcname=output.name)
            if cfg.emit_manifest and manifest.exists():
                z.write(manifest, arcname=manifest.name)
    print(f"[dir2md] Wrote: {output}")
    return 0
107
+
108
+ if __name__ == "__main__":
109
+ raise SystemExit(main())
src/dir2md/core.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import List, Optional
5
+ import json
6
+
7
+ from .gitignore import build_gitignore_matcher
8
+ from .markdown import to_markdown
9
+ from .simhash import simhash64, hamming
10
+ from .summary import summarize
11
+ from .manifest import sha256_bytes, write_manifest
12
+ from .token import estimate_tokens
13
+ from .masking import apply_masking
14
+
15
@dataclass
class Stats:
    """Mutable counters collected while a report is generated; all start at zero."""

    # Incremented once per directory entered by the tree walk.
    total_dirs: int = 0
    total_files_in_tree: int = 0
    total_omitted: int = 0
    total_with_contents: int = 0
    est_tokens_prompt: int = 0
22
+
23
@dataclass
class Config:
    """All settings for one report generation run.

    The first ten fields are required; the remainder carry defaults. Note that
    ``apply_preset`` may overwrite several of these before they are used.
    """

    # --- Required: input/output and filtering -------------------------------
    root: Path                 # directory to scan; must exist and be a directory
    output: Path
    include_globs: List[str]
    exclude_globs: List[str]   # entries matching these are dropped from the tree
    omit_globs: List[str]      # matching files are skipped as content candidates
    respect_gitignore: bool    # build a .gitignore matcher for `root` when True
    follow_symlinks: bool
    max_bytes: Optional[int]
    max_lines: Optional[int]
    include_contents: bool

    # --- Optional: rendering ------------------------------------------------
    only_ext: Optional[set[str]] = None   # lower-case extensions without the dot
    add_stats: bool = True
    add_toc: bool = False

    # --- Optional: preset / token related -----------------------------------
    llm_mode: str = "ref"  # off|ref|summary|inline
    budget_tokens: int = 6000
    max_file_tokens: int = 1200
    dedup_bits: int = 16
    sample_head: int = 120
    sample_tail: int = 40
    strip_comments: bool = False
    emit_manifest: bool = True
    preset: str = "iceberg"
    explain_capsule: bool = False
    no_timestamp: bool = False
    masking_mode: str = "basic"
51
+
52
+ _DEFAULT_ONLY_EXT = {"py","ts","tsx","js","jsx","md","txt","toml","yaml","yml","json", ""}
53
+
54
+
55
def apply_preset(cfg: Config) -> Config:
    """Adjust ``cfg`` in place according to ``cfg.preset`` and return it.

    * ``"iceberg"``: forces ``respect_gitignore``, ``dedup_bits = 16`` and
      ``emit_manifest``; falls back to the default extension allow-list when
      none was given; then picks ``llm_mode`` and caps the token budgets based
      on the total size of the files under ``cfg.root``.
    * ``"raw"``: inline mode, no deduplication, no extension filter, no manifest.
    * anything else (``"pro"``): the caller's settings are left untouched.

    Args:
        cfg: Configuration to adjust (mutated).

    Returns:
        The same ``cfg`` object.
    """
    if cfg.preset == "iceberg":
        cfg.respect_gitignore = True
        if not cfg.only_ext:
            cfg.only_ext = set(_DEFAULT_ONLY_EXT)
        cfg.dedup_bits = 16
        cfg.emit_manifest = True

        # Only this preset needs the repository size, so the recursive scan is
        # done here instead of up front for every preset.
        try:
            total_bytes = sum(f.stat().st_size for f in cfg.root.rglob('*') if f.is_file())
        except Exception:
            # Best effort: an unreadable tree is treated as empty.
            total_bytes = 0

        # Auto-determine mode based on repository size
        if total_bytes < 200_000:
            cfg.llm_mode = "inline"
            cfg.budget_tokens = min(cfg.budget_tokens, 6000)
            cfg.max_file_tokens = 1000
        elif total_bytes < 5_000_000:
            cfg.llm_mode = "summary"
            cfg.budget_tokens = min(cfg.budget_tokens, 6000)
        else:
            cfg.llm_mode = "ref"
            cfg.budget_tokens = min(cfg.budget_tokens, 4000)
    elif cfg.preset == "raw":
        cfg.llm_mode = "inline"
        cfg.dedup_bits = 0
        cfg.only_ext = None
        cfg.emit_manifest = False
    # pro: maintain user settings
    return cfg
77
+
78
+
79
def generate_markdown_report(cfg: Config) -> str:
    """Scan ``cfg.root`` and return the rendered Markdown report.

    The function applies the preset, walks the directory tree (filtering by
    .gitignore and ``cfg.exclude_globs``), builds a de-duplicated candidate
    list, selects blocks according to ``cfg.llm_mode`` while staying within
    ``cfg.budget_tokens``, optionally writes a JSON manifest next to
    ``cfg.output``, and finally delegates rendering to ``to_markdown``.

    Args:
        cfg: run configuration; it is passed through ``apply_preset`` first,
            which may mutate it.

    Returns:
        The Markdown document as a string.

    Raises:
        FileNotFoundError: ``cfg.root`` does not exist.
        NotADirectoryError: ``cfg.root`` is not a directory.
    """
    cfg = apply_preset(cfg)
    root = cfg.root
    if not root.exists():
        raise FileNotFoundError(f"Path does not exist: {root}")
    if not root.is_dir():
        raise NotADirectoryError(f"Path is not a directory: {root}")

    gitignore = build_gitignore_matcher(root) if cfg.respect_gitignore else None

    def matches_any(p: Path, patterns) -> bool:
        if not patterns:
            return False
        # Bare names are compared against components *below* root only, so a
        # pattern such as "tmp" cannot hide every file merely because root
        # itself lives under /tmp.
        parts = p.relative_to(root).parts
        return any(p.match(pat) or pat in parts for pat in patterns)

    def is_ignored(p: Path) -> bool:
        if gitignore is not None:
            rel = p.relative_to(root).as_posix()
            if gitignore(rel):
                return True
            # Directory-only patterns ("build/") match only a path with a
            # trailing slash, so directories are probed in that form too.
            if p.is_dir() and gitignore(rel + "/"):
                return True
        return matches_any(p, cfg.exclude_globs)

    def is_omitted(p: Path) -> bool:
        return matches_any(p, cfg.omit_globs)

    # Tree & file collection
    tree_lines: list[str] = [str(root)]
    files: list[Path] = []
    stats = Stats()  # Pre-create for accurate directory counting
    visited: set[Path] = set()  # resolved directories, guards symlink cycles

    def walk(current: Path, prefix: str = "") -> None:
        try:
            real = current.resolve()
        except (OSError, RuntimeError):
            real = current
        if real in visited:
            return
        visited.add(real)
        # Count when entering directory
        stats.total_dirs += 1
        try:
            entries = sorted(current.iterdir(), key=lambda x: (not x.is_dir(), x.name.lower()))
        except OSError:
            return
        entries = [e for e in entries if not is_ignored(e)]
        for i, child in enumerate(entries):
            last = (i == len(entries) - 1)
            joint = "└── " if last else "├── "
            tree_lines.append(f"{prefix}{joint}{child.name}")
            if child.is_dir():
                # Symlinked directories are listed but only descended into
                # when the configuration asks for it.
                if child.is_symlink() and not cfg.follow_symlinks:
                    continue
                walk(child, prefix + ("    " if last else "│   "))
            else:
                files.append(child)

    walk(root)

    # Generate candidates + deduplication
    candidates: list[dict] = []
    sim_seen: list[int] = []
    digests: dict[Path, str] = {}
    for f in files:
        if cfg.only_ext and f.suffix.lstrip(".").lower() not in cfg.only_ext:
            continue
        if is_omitted(f):
            continue
        try:
            raw = f.read_bytes()
        except OSError:
            continue
        # Hash the complete file before any truncation so the digest
        # identifies the file itself, not the sampled prefix.
        digest = sha256_bytes(raw)
        if cfg.max_bytes and len(raw) > cfg.max_bytes:
            raw = raw[: cfg.max_bytes]
        text = raw.decode("utf-8", errors="replace")
        if cfg.masking_mode != "off":
            text = apply_masking(text, mode=cfg.masking_mode)
        sh = simhash64(text)
        # Deduplication
        if cfg.dedup_bits > 0 and any(hamming(sh, h0) <= cfg.dedup_bits for h0 in sim_seen):
            continue
        sim_seen.append(sh)
        digests[f] = digest
        candidates.append({
            "path": f,
            "sha256": digest,
            "summary": summarize(f, text, max_lines=40),
            "text": text,
            "simhash": sh,
        })

    # Apply budget + reflect mode (Explain & Drift)
    est_total = 0
    selected_blocks: list[tuple[Path, str, str]] = []
    selected_hashes: list[int] = []

    def drift_score_bits(sh: int) -> int:
        # Smallest Hamming distance to anything already selected; 64 when
        # nothing has been selected yet.
        return min((hamming(sh, prev) for prev in selected_hashes), default=64)

    for rec in candidates:
        if cfg.llm_mode == "off":
            break
        sh = rec["simhash"]
        drift_bits = drift_score_bits(sh)
        drift = round(drift_bits / 64, 3)  # 0~1, higher = fresher
        if cfg.llm_mode == "ref":
            meta = json.dumps({"sha256": rec["sha256"], "path": str(rec["path"]), "drift": drift}, ensure_ascii=False)
            tok = estimate_tokens(meta) + 16
            if est_total + tok > cfg.budget_tokens:
                continue
            est_total += tok
            selected_blocks.append((rec["path"], "json", meta))
            selected_hashes.append(sh)
        elif cfg.llm_mode == "summary":
            payload = rec["summary"]
            tok = estimate_tokens(payload)
            if est_total + tok > cfg.budget_tokens:
                continue
            est_total += tok
            text = payload
            if cfg.explain_capsule:
                text += f"\n\n<!-- why: summary; drift={drift} -->"
            selected_blocks.append((rec["path"], "markdown", text))
            selected_hashes.append(sh)
        else:  # inline
            lines = rec["text"].splitlines()
            if cfg.max_lines and len(lines) > cfg.max_lines:
                lines = lines[: cfg.max_lines]
            content = "\n".join(lines)
            if estimate_tokens(content) > cfg.max_file_tokens:
                head = lines[: max(0, cfg.sample_head)]
                # The tail is taken from what the head left over, so the two
                # windows can never overlap and repeat lines.
                rest = lines[len(head):]
                tail = rest[-cfg.sample_tail:] if cfg.sample_tail > 0 else []
                omitted = len(lines) - len(head) - len(tail)
                if omitted > 0:
                    mid = f"\n<!-- [truncated middle: {omitted} lines omitted] -->\n"
                    content = "\n".join(head + [mid] + tail)
            tok = min(cfg.max_file_tokens, estimate_tokens(content))
            if est_total + tok > cfg.budget_tokens:
                continue
            est_total += tok
            if cfg.explain_capsule:
                content += f"\n\n<!-- why: inline; drift={drift}; tok={tok} -->"
            lang = rec["path"].suffix.lstrip(".") or "text"
            selected_blocks.append((rec["path"], lang, content))
            selected_hashes.append(sh)

    # Final reflection of accumulated statistics
    stats.total_files_in_tree = len(files)
    stats.total_omitted = max(0, len(files) - len(selected_blocks))
    stats.total_with_contents = len(selected_blocks)
    stats.est_tokens_prompt = est_total
    # Note: stats.total_dirs accumulated during walk()

    # Manifest
    if cfg.emit_manifest:
        file_manifest = []
        for (p, lang, t) in selected_blocks:
            entry = {
                "path": str(p.relative_to(root)),
                "mode": cfg.llm_mode,
                # Digest recorded when the file was read; no second read.
                "sha256": digests.get(p),
            }
            # Only ref-mode blocks carry our own JSON metadata.  An inlined
            # *.json file also has lang == "json" but its content must not be
            # merged into the manifest entry.
            if cfg.llm_mode == "ref":
                try:
                    meta = json.loads(t)
                except ValueError:
                    meta = None
                if isinstance(meta, dict):
                    # Add extra keys (drift, etc.) without overwriting the
                    # relative path or digest already set above.
                    for key, value in meta.items():
                        entry.setdefault(key, value)
            file_manifest.append(entry)

        full_manifest = {
            "stats": {
                "total_dirs": stats.total_dirs,
                "total_files_in_tree": stats.total_files_in_tree,
                "total_omitted": stats.total_omitted,
                "total_with_contents": stats.total_with_contents,
                "est_tokens_prompt": stats.est_tokens_prompt,
            },
            "files": file_manifest
        }
        write_manifest(full_manifest, cfg.output.with_suffix('.manifest.json'))

    return to_markdown(cfg, tree_lines, selected_blocks, stats)
src/dir2md/gitignore.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ from typing import List, Optional, Callable
4
+
5
+ try:
6
+ from pathspec import PathSpec
7
+ except Exception:
8
+ PathSpec = None # type: ignore
9
+
10
+
11
+ def _collect_gitignore_lines(root: Path) -> List[str]:
12
+ lines: List[str] = []
13
+ for gi in root.rglob('.gitignore'):
14
+ rel_dir = gi.parent.relative_to(root)
15
+ base = str(rel_dir).replace('\\', '/')
16
+ raw = gi.read_text(encoding='utf-8', errors='ignore').splitlines()
17
+ for ln in raw:
18
+ s = ln.strip()
19
+ if not s or s.startswith('#'):
20
+ continue
21
+ if s.startswith('/'):
22
+ s = s[1:]
23
+ if base and s:
24
+ s = f"{base}/{s}"
25
+ lines.append(s)
26
+ return lines
27
+
28
+
29
def build_gitignore_matcher(root: Path) -> Optional[Callable[[str], bool]]:
    """Create a predicate telling whether a root-relative path is ignored.

    Returns ``None`` when ``pathspec`` could not be imported or when no
    pattern lines were found; otherwise a callable taking the relative path
    as a string and returning the ``PathSpec`` match result.
    """
    if PathSpec is None:
        return None

    collected = _collect_gitignore_lines(root)
    top_level = root / ".gitignore"
    if top_level.exists():
        # Lines of the top-level file go first, ahead of the collected ones.
        own_lines = top_level.read_text(encoding='utf-8', errors='ignore').splitlines()
        patterns = own_lines + collected
    else:
        patterns = collected

    if not patterns:
        return None

    compiled = PathSpec.from_lines("gitwildmatch", patterns)

    def match(relpath: str) -> bool:
        return compiled.match_file(relpath)

    return match
src/dir2md/license.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """License and feature gating for dir2md open-core model"""
2
+ import os
3
+ from typing import Set
4
+
5
class LicenseManager:
    """Decides which dir2md features are usable for the current license."""

    def __init__(self):
        # The key comes from the environment; an unset variable means "no license".
        self.license_key = os.environ.get('DIR2MD_LICENSE', '')
        self.is_pro = self._validate_license()

    def _validate_license(self) -> bool:
        """Return True when the key looks like a Pro key (simplified for demo)."""
        # In production, this would validate ed25519 signature
        key = self.license_key
        if not key.startswith('PRO-'):
            return False
        return len(key) > 10

    def get_available_features(self) -> Set[str]:
        """Return a fresh set with the feature names usable under this license."""
        features = {
            'basic_masking',
            'directory_scan',
            'gitignore_filter',
            'token_estimation',
            'simhash_dedup',
            'manifest_json',
            'deterministic_output',
            'basic_stats',
        }
        if not self.is_pro:
            return features

        features |= {
            'advanced_masking',
            'language_plugins',
            'parallel_processing',
            'incremental_cache',
            'drift_comparison',
            'html_pdf_export',
            'pr_integration',
            'tui_interface',
        }
        return features

    def check_feature(self, feature: str) -> bool:
        """Tell whether ``feature`` is in the available feature set."""
        available = self.get_available_features()
        return feature in available

    def require_pro(self, feature: str) -> None:
        """Raise ``LicenseError`` unless ``feature`` is available."""
        if self.check_feature(feature):
            return
        raise LicenseError(
            f"Feature '{feature}' requires dir2md Pro license. "
            f"Visit https://dir2md.com/pro for more information."
        )
56
+
57
class LicenseError(Exception):
    """Signals that a Pro-only feature was requested without a Pro license."""
60
+
61
+ # Global license manager instance
62
+ license_manager = LicenseManager()
src/dir2md/manifest.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ import json, hashlib
4
+
5
def sha256_bytes(b: bytes) -> str:
    """Return the SHA-256 digest of ``b`` as a hexadecimal string."""
    hasher = hashlib.sha256()
    hasher.update(b)
    return hasher.hexdigest()
7
+
8
def write_manifest(data: dict, out: Path) -> None:
    """Serialize ``data`` as indented UTF-8 JSON and write it to ``out``.

    Missing parent directories of ``out`` are created first, so an output
    path inside a not-yet-existing folder does not fail.

    Args:
        data: JSON-serializable mapping.
        out: destination file; an existing file is overwritten.
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII paths readable in the file.
    out.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
src/dir2md/markdown.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING
5
+
6
+ if TYPE_CHECKING:
7
+ from .core import Config, Stats
8
+
9
def to_markdown(cfg: 'Config', tree_lines: list[str], file_blocks: list[tuple[Path, str, str]], stats: 'Stats') -> str:
    """Render the report header, directory tree, file blocks and summary.

    Args:
        cfg: supplies ``root``, ``no_timestamp``, ``preset``, ``llm_mode``
            and ``add_stats``.
        tree_lines: pre-rendered lines of the directory tree.
        file_blocks: ``(path, lang, text)`` triples; ``path`` must be located
            under ``cfg.root``.
        stats: counters shown in the header and the summary table.

    Returns:
        The Markdown document.
    """

    def opening_fence(body: str) -> str:
        # A fence must be longer than any backtick run inside the body,
        # otherwise embedded ``` lines (Markdown files, YAML with code
        # samples) would terminate the block early.
        longest = run = 0
        for ch in body:
            run = run + 1 if ch == "`" else 0
            if run > longest:
                longest = run
        return "`" * max(3, longest + 1)

    parts: list[str] = []
    parts.append("# Project Blueprint\n")
    parts.append(f"- Root: `{cfg.root}` ")
    if not cfg.no_timestamp:
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        parts.append(f"- Generated: `{ts}` ")
    parts.append(f"- Preset: `{cfg.preset}` ")
    parts.append(f"- LLM mode: `{cfg.llm_mode}` ")
    parts.append(f"- Estimated tokens (prompt): `{stats.est_tokens_prompt}` ")
    parts.append("")
    parts.append("## Directory Tree\n")
    tree_text = "\n".join(tree_lines)
    fence = opening_fence(tree_text)
    # The closing fence stays one backtick longer than the opening one, which
    # keeps the established output format; a closer at least as long as the
    # opener is valid.
    parts.append(f"{fence}\n" + tree_text + f"\n{fence}`\n\n")
    if cfg.llm_mode != "off" and file_blocks:
        parts.append("## File Contents\n")
        for path, lang, text in file_blocks:
            rel = path.relative_to(cfg.root)
            fence = opening_fence(text)
            parts.append(f"### File: `{rel}`\n")
            parts.append(f"{fence}{lang}\n{text}\n\n{fence}`\n")
    if cfg.add_stats:
        parts.append("## Summary\n")
        parts.append("| metric | value |\n|---|---:|")
        parts.append(f"| dirs | {stats.total_dirs} |")
        parts.append(f"| files in tree | {stats.total_files_in_tree} |")
        parts.append(f"| selected files | {stats.total_with_contents} |")
        parts.append(f"| omitted | {stats.total_omitted} |")
        parts.append(f"| est tokens (prompt) | {stats.est_tokens_prompt} |\n")
    return "\n".join(parts)
src/dir2md/parallel.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Parallel processing module (Pro feature)"""
2
+ from .license import license_manager, LicenseError
3
+
4
def parallel_file_processing(files, processor_func, max_workers=4):
    """Process files in parallel (Pro feature).

    Args:
        files: iterable of items handed to ``processor_func`` one by one.
        processor_func: callable applied to each item.
        max_workers: size of the thread pool; defaults to 4, the previously
            hard-coded value.

    Returns:
        List of results in the same order as ``files``.

    Raises:
        LicenseError: the 'parallel_processing' feature is not available.
    """
    license_manager.require_pro('parallel_processing')

    # This would contain actual parallel processing logic
    # For demo, just show the restriction
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(processor_func, files))
14
+
15
def check_cache(file_path):
    """Report whether ``file_path`` is cached (Pro feature).

    The license check runs first; the lookup itself is a placeholder that
    always answers ``False``.
    """
    license_manager.require_pro('incremental_cache')

    # Cache checking logic would go here
    is_cached = False  # Simplified for demo
    return is_cached
src/dir2md/simhash.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import Iterable
3
+ import re, hashlib
4
+
5
+ _TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
6
+
7
def _tokens(s: str) -> list[str]:
    """Lower-case ``s`` and return every match of ``_TOKEN_RE`` in order."""
    lowered = s.lower()
    return [found.group(0) for found in _TOKEN_RE.finditer(lowered)]
9
+
10
+ def _shingles(seq: list[str], k: int = 4) -> Iterable[int]:
11
+ if k <= 0:
12
+ k = 4
13
+ for i in range(max(0, len(seq)-k+1)):
14
+ payload = " ".join(seq[i:i+k]).encode()
15
+ yield int.from_bytes(hashlib.blake2b(payload, digest_size=8).digest(), 'big')
16
+
17
def simhash64(s: str, k: int = 4) -> int:
    """Compute a 64-bit SimHash of ``s`` from its ``k``-token shingles.

    Each shingle hash votes +1 or -1 on every bit position; the result has a
    bit set wherever the vote total is strictly positive.
    """
    votes = [0] * 64
    for shingle in _shingles(_tokens(s), k=k):
        for pos in range(64):
            if (shingle >> pos) & 1:
                votes[pos] += 1
            else:
                votes[pos] -= 1
    return sum(1 << pos for pos, total in enumerate(votes) if total > 0)
27
+
28
def hamming(a: int, b: int) -> int:
    """Count the bit positions in which ``a`` and ``b`` differ."""
    differing = a ^ b
    return bin(differing).count("1")
src/dir2md/summary.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ import ast
4
+
5
def summarize(path: Path, content: str, max_lines: int = 60) -> str:
    """Produce a short bullet-list summary of a file.

    * ``.py``: top-level imports, classes and functions (including
      ``async def``), each list cut to 200 characters; if parsing fails the
      generic fallback below is used.
    * ``.md`` / ``.markdown``: up to ten heading lines; falls back to the
      first lines when there are none.
    * anything else: the first ``max_lines`` lines, non-blank ones only.

    Args:
        path: used only for its suffix.
        content: decoded file text.
        max_lines: line limit for the first-lines fallback.

    Returns:
        Newline-joined ``- ...`` bullet lines.
    """
    ext = path.suffix.lower()
    if ext == ".py":
        try:
            tree = ast.parse(content)
            # Coroutines are top-level functions too and belong in the list.
            funcs = [n.name for n in tree.body if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))]
            clss = [n.name for n in tree.body if isinstance(n, ast.ClassDef)]
            imps = []
            for n in tree.body:
                if isinstance(n, (ast.Import, ast.ImportFrom)):
                    imps.extend(a.name for a in n.names)
            lines = []
            if imps:
                lines.append(f"- imports: {', '.join(imps)[:200]}")
            if clss:
                lines.append(f"- classes: {', '.join(clss)[:200]}")
            if funcs:
                lines.append(f"- functions: {', '.join(funcs)[:200]}")
            return "\n".join(lines) or "- (no symbols)"
        except Exception:
            # Unparseable source: fall through to the generic summary.
            pass
    if ext in {".md", ".markdown"}:
        heads = [ln.strip() for ln in content.splitlines() if ln.strip().startswith("#")][:10]
        return "\n".join([f"- {h}" for h in heads]) or _first_lines(content, max_lines)
    return _first_lines(content, max_lines)
29
+
30
+ def _first_lines(content: str, max_lines: int) -> str:
31
+ lines = content.splitlines()[:max_lines]
32
+ return "\n".join([f"- {ln}" for ln in lines if ln.strip()])
src/dir2md/token.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
def estimate_tokens(text: str) -> int:
    """Estimate the token count of ``text`` as ceil(len / 4), never below 1."""
    # Simple estimation: 4 chars ≈ 1 token
    whole, leftover = divmod(len(text), 4)
    estimate = whole + 1 if leftover else whole
    return estimate if estimate >= 1 else 1
tests/test_dir2md.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import json, tempfile
3
+ from pathlib import Path
4
+ from dir2md.core import Config, generate_markdown_report
5
+
6
+
7
def _make_repo(tmp: Path) -> Path:
    """Populate ``tmp`` with a small fixture repository and return ``tmp``.

    Layout: ``src/a.py`` (long function body), ``src/b.py``, ``src/b_copy.py``
    (byte-for-byte copy of ``b.py``) and ``README.md``.
    """
    (tmp/"src").mkdir(parents=True, exist_ok=True)
    # Make this file long enough to trigger truncation
    long_content = "\n".join([f"    print('line {i}')" for i in range(100)])
    (tmp/"src"/"a.py").write_text(f"""
import os

class A: pass

def foo():
{long_content}
    return 42
""", encoding="utf-8")
    (tmp/"src"/"b.py").write_text("""
import sys

def bar():
    return 43
""", encoding="utf-8")
    # Similar file (for deduplication testing)
    (tmp/"src"/"b_copy.py").write_text((tmp/"src"/"b.py").read_text(encoding="utf-8"), encoding="utf-8")
    (tmp/"README.md").write_text("# Title\n\nSome text\n", encoding="utf-8")
    return tmp
30
+
31
+
32
def test_budget_and_modes(tmp_path: Path):
    """Summary mode with a 200-token budget writes a manifest listing a.py and b.py."""
    root = _make_repo(tmp_path)
    # preset="pro" is not handled specially by apply_preset, so the values
    # given here are used as-is.
    cfg = Config(
        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
        llm_mode="summary", budget_tokens=200, max_file_tokens=1200, dedup_bits=16,
        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=True,
        preset="pro", explain_capsule=True,
    )
    md = generate_markdown_report(cfg)
    assert "Estimated tokens (prompt):" in md
    # The manifest path is cfg.output with its suffix replaced by ".manifest.json".
    mpath = (root/"OUT.manifest.json")
    assert mpath.exists()
    man = json.loads(mpath.read_text(encoding="utf-8"))
    # b_copy.py likely to be excluded due to deduplication
    # (this is not asserted; only the presence of a.py and b.py is checked)
    paths = {entry["path"] for entry in man["files"]}
    assert any(p.endswith("a.py") for p in paths)
    assert any(p.endswith("b.py") for p in paths)
51
+
52
+
53
def test_ref_mode_manifest(tmp_path: Path):
    """Ref mode writes a manifest with "stats", "files" and a sha256 key per file entry."""
    root = _make_repo(tmp_path)
    cfg = Config(
        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
        llm_mode="ref", budget_tokens=120, max_file_tokens=1200, dedup_bits=16,
        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=True,
        preset="pro", explain_capsule=False,
    )
    # The returned Markdown is not inspected; the call is made for the manifest it writes.
    md = generate_markdown_report(cfg)
    man = json.loads((root/"OUT.manifest.json").read_text(encoding="utf-8"))
    assert "stats" in man
    assert "files" in man
    # Only the presence of the key is checked, not its value.
    assert all("sha256" in e for e in man["files"])
68
+
69
+
70
def test_inline_sampling(tmp_path: Path):
    """Inline mode with a tiny per-file limit emits the truncation marker and the explain comment."""
    root = _make_repo(tmp_path)
    # Drastically reduced budget to trigger sampling
    # (max_file_tokens=30 with sample_head=5 / sample_tail=3; dedup disabled via dedup_bits=0)
    cfg = Config(
        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=50,
        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
        llm_mode="inline", budget_tokens=50, max_file_tokens=30, dedup_bits=0,
        sample_head=5, sample_tail=3, strip_comments=False, emit_manifest=False,
        preset="pro", explain_capsule=True,
    )
    md = generate_markdown_report(cfg)
    # Marker inserted between the head and tail samples.
    assert "truncated middle" in md
    # Comment appended because explain_capsule=True.
    assert "why: inline" in md
84
+
85
def test_masking(tmp_path: Path):
    """A secret in .env is replaced under masking_mode="basic" and left intact under "off"."""
    root = _make_repo(tmp_path)
    # Add a file with a secret
    secret_content = "My AWS key is AKIAIOSFODNN7EXAMPLE"
    (root / ".env").write_text(secret_content, encoding="utf-8")

    # only_ext=None, so the suffix-less .env file is not filtered out by extension.
    cfg = Config(
        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
        llm_mode="inline", budget_tokens=1000, max_file_tokens=1000, dedup_bits=0,
        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=False,
        preset="pro", explain_capsule=False, no_timestamp=True,
        masking_mode="basic",
    )
    md = generate_markdown_report(cfg)

    assert secret_content not in md
    assert "[*** MASKED_SECRET ***]" in md

    # Test with masking off
    # (the same Config object is reused with only masking_mode changed)
    cfg.masking_mode = "off"
    md_unmasked = generate_markdown_report(cfg)
    assert secret_content in md_unmasked
    assert "[*** MASKED_SECRET ***]" not in md_unmasked