#!/bin/bash
#
# Launch TRUE 1.21B Parameter BitTransformerLM Training
# ====================================================
#
# PROPER FSDP sharding across 4 GPUs + inference testing!
#
set -e
echo "π₯ TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
echo "================================================="
echo "π― PROPER FSDP SHARDING (not duplication!)"
echo "β
Based on proven 680M success"
echo "π Full training + inference testing"
echo ""
# Optimal environment setup
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12
export HF_TOKEN="${HF_TOKEN:-your-token-here}"
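# Optional guard (assumption: true_1b_training.py needs a real Hugging Face
# token for hub access; skip this check if the run is fully offline):
if [ "${HF_TOKEN}" = "your-token-here" ]; then
    echo "WARNING: HF_TOKEN is not set; Hugging Face access may fail."
fi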
cd /data/BitTransformerLM/BitTransformerLM
echo "π Hardware Check:"
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f' GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
print(f'Total VRAM: {sum(torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count())) / 1024**3:.1f}GB')
"
echo ""
echo "βοΈ TRUE 1.21B CONFIGURATION:"
echo " π― Parameters: 1,210,000,000+ (1.21B)"
echo " π Architecture: d_model=2048, layers=24, heads=32"
echo " π§ Memory Strategy: FSDP Full Sharding across 4 GPUs"
echo " π Sequence Length: 512 (optimized from 680M success)"
echo " β‘ Mixed Precision: FP16"
echo " π‘οΈ Safety Telemetry: K, C, S metrics enabled"
echo " π§ All Optimizations: Reversible + Checkpointing + Chunked Attention"
echo ""
echo "π Starting TRUE 1.21B parameter training..."
echo " This WILL work - we've proven the capability!"
echo ""
# Launch training
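# Assumption: true_1b_training.py sets up torch.distributed and spawns its own
# worker processes. If it instead expects an external launcher, the equivalent
# invocation would be:
#   torchrun --nproc_per_node=4 true_1b_training.py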
python true_1b_training.py
echo ""
echo "π TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
echo "π Check /data/true_1b_results.json for full results"
echo "πΎ Model checkpoint saved for inference"
echo "π§ͺ Inference testing completed" |