#!/usr/bin/env python3
"""
Sync the BitTransformerLM repository to the HuggingFace Hub for the OS launch.

Uploads all cleaned documentation and code with a proper commit message.
"""
import os
import logging
from pathlib import Path
from typing import Optional

from huggingface_hub import HfApi, login

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "🚀 OS Launch: Clean documentation and refined licensing",
) -> bool:
    """
    Sync the entire cleaned BitTransformerLM repository to the HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID.
        token: HF token (defaults to the HF_TOKEN environment variable).
        commit_message: Commit message for the upload.

    Returns:
        True if the sync completed, False on authentication or upload failure.
    """
    # Get token from environment if not provided
    if token is None:
        token = os.environ.get('HF_TOKEN')
    if not token:
        logger.error("HF_TOKEN environment variable not set and no token provided")
        return False
    try:
        # Login to HuggingFace
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        # Get the repository root directory (this script lives at the root)
        repo_root = Path(__file__).parent
        logger.info(f"Repository root: {repo_root}")

        # Files and directories to upload (excluding unnecessary files)
        include_patterns = [
            # Core code
            "bit_transformer/**/*.py",
            "tests/**/*.py",
            "*.py",  # Root-level Python files
            # Documentation (cleaned)
            "README.md",
            "MODEL_CARD.md",
            "RESEARCH_STATUS.md",
            "EMPIRICAL_VALIDATION.md",
            "OPEN_SOURCE_LAUNCH.md",
            "AGENTS.md",
            # Configuration
            "requirements.txt",
            "pyproject.toml",
            "Dockerfile",
            "start.sh",
            # License files (cleaned)
            "LICENSE/**/*.txt",
        ]

        # Files to exclude
        exclude_patterns = [
            "__pycache__/**",
            "*.pyc",
            ".git/**",
            ".pytest_cache/**",
            "weights/**",
            "checkpoints/**",
            "*.log",
            # Outdated documentation
            "BitTransformerLM_full_assessment.md",
            "FORENSIC_*.md",
            "state_of_the_repo_audit.md",
            # Old upload script
            "upload_to_hf.py",
        ]
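
        # Hedged alternative (not executed): HfApi.upload_folder can push the
        # whole tree in a single commit and accepts allow/ignore glob patterns
        # directly. This script instead filters and uploads file-by-file below
        # so per-file progress and failures can be logged.
        #
        # api.upload_folder(
        #     folder_path=str(repo_root),
        #     repo_id=repo_id,
        #     repo_type="model",
        #     commit_message=commit_message,
        #     allow_patterns=include_patterns,
        #     ignore_patterns=exclude_patterns,
        # )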

        # Get all files to upload
        files_to_upload = []
        for pattern in include_patterns:
            for file_path in repo_root.glob(pattern):
                if file_path.is_file():
                    # Check if file should be excluded
                    relative_path = file_path.relative_to(repo_root)
                    should_exclude = any(
                        relative_path.match(exclude)
                        for exclude in exclude_patterns
                    )
                    if not should_exclude:
                        files_to_upload.append(file_path)

        logger.info(f"Found {len(files_to_upload)} files to upload")

        # Upload files one at a time (one Hub commit per file)
        uploaded_count = 0
        for file_path in files_to_upload:
            try:
                relative_path = file_path.relative_to(repo_root)
                logger.info(f"Uploading: {relative_path}")
                api.upload_file(
                    path_or_fileobj=str(file_path),
                    path_in_repo=str(relative_path),
                    repo_id=repo_id,
                    repo_type="model",
                    commit_message=commit_message,
                    commit_description="""
This OS launch commit includes:

✅ **Cleaned Documentation**
- Removed inflated claims and marketing language
- Added honest research status and limitations
- Created professional model card and validation reports
- Streamlined licensing to AGPLv3 + commercial contact

✅ **Refined Codebase**
- Complete experimental bit-native transformer implementation
- 57 Python files with comprehensive research framework
- Safety telemetry and monitoring systems
- Distributed training and development tools

✅ **Professional Standards**
- Empirical validation of all claims
- Clear experimental vs production distinctions
- Rigorous research methodology requirements
- Community contribution framework

Ready for serious research evaluation and academic investigation.
""".strip(),
                )
                uploaded_count += 1
                if uploaded_count % 10 == 0:
                    logger.info(f"Progress: {uploaded_count}/{len(files_to_upload)} files uploaded")
            except Exception as e:
                logger.warning(f"Failed to upload {relative_path}: {e}")
                continue
logger.info(f"β
Successfully uploaded {uploaded_count}/{len(files_to_upload)} files")
logger.info(f"π Repository synced to: https://huggingface.co/{repo_id}")
return True
except Exception as e:
logger.error(f"β Failed to sync repository: {e}")
return False
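

# A minimal sketch of batching the sync into one atomic Hub commit instead of
# one commit per file, using HfApi.create_commit with CommitOperationAdd
# operations. The helper name and its wiring are assumptions for illustration;
# nothing in this script calls it.
def sync_in_single_commit(api: HfApi, repo_root: Path, files_to_upload,
                          repo_id: str, commit_message: str) -> None:
    """Upload all collected files in a single commit (illustrative sketch)."""
    from huggingface_hub import CommitOperationAdd

    # One add-operation per file, preserving repo-relative paths
    operations = [
        CommitOperationAdd(
            path_in_repo=str(path.relative_to(repo_root)),
            path_or_fileobj=str(path),
        )
        for path in files_to_upload
    ]
    api.create_commit(
        repo_id=repo_id,
        repo_type="model",
        operations=operations,
        commit_message=commit_message,
    )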


def create_release_info():
    """Create a release information file for the OS launch."""
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release

**Release Date:** August 2025
**Status:** Open Source Research Implementation
**License:** AGPLv3 + Commercial Licensing Available

## What's Included

This release provides a complete experimental framework for bit-native language modeling research:

- **Core Architecture:** 57 Python files implementing a bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports

## Important Notes

⚠️ **Experimental Status:** This is research code requiring rigorous baseline validation
⚠️ **Not Production Ready:** Needs extensive evaluation vs standard transformers
⚠️ **Research Use Only:** Intended for academic investigation and experimentation

## Licensing

- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact [email protected] for commercial licensing

## Next Steps

The research community is invited to:

1. Conduct rigorous baseline comparisons vs standard transformers
2. Evaluate on established language modeling benchmarks
3. Validate (or refute) the claimed memory-efficiency benefits
4. Share findings openly to advance the field

**Research responsibly. Validate rigorously. Share openly.**
"""

    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    with open(release_file, 'w') as f:
        f.write(release_info)

    logger.info("Created RELEASE_INFO.md")
    return release_file


if __name__ == "__main__":
    # Create the release info file
    create_release_info()

    # Sync to HuggingFace
    success = sync_repository_to_hf()

    if success:
        print("\n🎉 BitTransformerLM OS Launch Sync Complete!")
        print("🔗 Repository: https://huggingface.co/WCNegentropy/BitTransformerLM")
        print("📧 Commercial inquiries: [email protected]")
        print("\nReady for research community evaluation! 🧪✨")
    else:
        print("\n❌ Sync failed. Please check logs and try again.")