#!/bin/bash
#
# BitTransformerLM optimized massive-scale training launcher.
#
# Sets up the multi-GPU environment, verifies CUDA hardware, and runs
# massive_scale_simple.py (680M-parameter model, WikiText-103, bit-level
# encoding, DataParallel across 4 GPUs).
#
# Required env:
#   HF_TOKEN  - Hugging Face access token (never hardcode it here).

set -euo pipefail

echo "=== BITTRANSFORMERLM OPTIMIZED MASSIVE SCALE TRAINING ==="
echo "====================================================="
echo "Target: 680 MILLION parameters (CONFIRMED!)"
echo "Hardware: Multi-GPU with DataParallel"
echo "Dataset: WikiText-103 with bit-level encoding"
echo "Optimizations: ALL ENABLED!"
echo ""

# --- Runtime environment -------------------------------------------------
export CUDA_VISIBLE_DEVICES=0,1,2,3
# Reduce fragmentation-related OOMs during long training runs.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12

# Fail fast if the token is missing instead of proceeding with a junk
# placeholder that would only fail later inside the Python process.
: "${HF_TOKEN:?HF_TOKEN must be set (export HF_TOKEN=...)}"
export HF_TOKEN

readonly REPO_DIR=/data/BitTransformerLM/BitTransformerLM
readonly CKPT_DIR=/data/checkpoints

cd "$REPO_DIR" || { echo "ERROR: repo not found at $REPO_DIR" >&2; exit 1; }
mkdir -p "$CKPT_DIR"

# --- Hardware check ------------------------------------------------------
echo "Hardware Check:"
python - <<'PY'
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
PY

echo ""
echo "OPTIMIZATIONS ENABLED:"
echo "  [x] Reversible Layers (50% memory savings)"
echo "  [x] Gradient Checkpointing"
echo "  [x] Mixed Precision (FP16)"
echo "  [x] Memory-Mapped Dataset Loading"
echo "  [x] Safety Telemetry (K, C, S metrics)"
echo "  [x] Bit-Native Processing"
echo "  [x] DataParallel Multi-GPU"
echo ""

echo "Training Configuration:"
echo "  * Parameters: 679,962,626 (680M)"
echo "  * Architecture: d_model=1536, layers=24, heads=24"
echo "  * Batch Size: 2 per GPU"
echo "  * Gradient Accumulation: 16 steps"
echo "  * Effective Batch Size: 128"
echo "  * Learning Rate: 3e-4 with OneCycle"
echo "  * Dataset: WikiText-103 (2000 training samples)"
echo ""

echo "Starting optimized training..."
echo "  This version should train successfully!"
echo ""

# set -e aborts the script here if training exits non-zero, so the success
# message below only prints after a clean run.
python massive_scale_simple.py

echo ""
echo "Training completed successfully!"
echo "Check $CKPT_DIR for saved models"