#!/bin/bash # # BitTransformerLM OPTIMIZED Massive Scale Training Launcher # ========================================================== # # Launches 680M parameter BitTransformerLM with ALL optimizations enabled! # Uses DataParallel for reliable multi-GPU training. # set -e # Exit on any error echo "🚀 BITTRANSFORMERLM OPTIMIZED MASSIVE SCALE TRAINING" echo "=====================================================" echo "Target: 680 MILLION parameters (CONFIRMED!)" echo "Hardware: Multi-GPU with DataParallel" echo "Dataset: WikiText-103 with bit-level encoding" echo "Optimizations: ALL ENABLED!" echo "" # Set environment variables for optimal performance export CUDA_VISIBLE_DEVICES=0,1,2,3 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True export OMP_NUM_THREADS=12 # Set HuggingFace token export HF_TOKEN="${HF_TOKEN:-your-token-here}" # Change to BitTransformerLM directory cd /data/BitTransformerLM/BitTransformerLM # Create checkpoint directory mkdir -p /data/checkpoints echo "🔍 Hardware Check:" python -c " import torch print(f'CUDA Available: {torch.cuda.is_available()}') print(f'GPU Count: {torch.cuda.device_count()}') for i in range(torch.cuda.device_count()): props = torch.cuda.get_device_properties(i) print(f' GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)') " echo "" echo "⚙️ OPTIMIZATIONS ENABLED:" echo " ✅ Reversible Layers (50% memory savings)" echo " ✅ Gradient Checkpointing" echo " ✅ Mixed Precision (FP16)" echo " ✅ Memory-Mapped Dataset Loading" echo " ✅ Safety Telemetry (K, C, S metrics)" echo " ✅ Bit-Native Processing" echo " ✅ DataParallel Multi-GPU" echo "" echo "📊 Training Configuration:" echo " • Parameters: 679,962,626 (680M)" echo " • Architecture: d_model=1536, layers=24, heads=24" echo " • Batch Size: 2 per GPU" echo " • Gradient Accumulation: 16 steps" echo " • Effective Batch Size: 128" echo " • Learning Rate: 3e-4 with OneCycle" echo " • Dataset: WikiText-103 (2000 training samples)" echo "" echo "🎯 Starting optimized training..." echo " This version should train successfully!" echo "" # Launch optimized training python massive_scale_simple.py echo "" echo "🏁 Training completed successfully!" echo "Check /data/checkpoints/ for saved models"