#!/bin/bash
#
# Launch TRUE 1.21B Parameter BitTransformerLM Training
# ====================================================
#
# PROPER FSDP sharding across 4 GPUs + inference testing!
#

set -e

echo "🔥 TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
echo "================================================="
echo "🎯 PROPER FSDP SHARDING (not duplication!)"
echo "✅ Based on proven 680M success"
echo "🚀 Full training + inference testing"
echo ""

# Optimal environment setup
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12
export HF_TOKEN="${HF_TOKEN:-your-token-here}"

cd /data/BitTransformerLM/BitTransformerLM

echo "🔍 Hardware Check:"
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
print(f'Total VRAM: {sum(torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count())) / 1024**3:.1f}GB')
"

echo ""
echo "⚙️ TRUE 1.21B CONFIGURATION:"
echo "   🎯 Parameters: 1,210,000,000+ (1.21B)"
echo "   📐 Architecture: d_model=2048, layers=24, heads=32"
echo "   🧠 Memory Strategy: FSDP Full Sharding across 4 GPUs"
echo "   🔄 Sequence Length: 512 (optimized from 680M success)"
echo "   ⚡ Mixed Precision: FP16"
echo "   🛡️ Safety Telemetry: K, C, S metrics enabled"
echo "   🔧 All Optimizations: Reversible + Checkpointing + Chunked Attention"
echo ""

echo "🚀 Starting TRUE 1.21B parameter training..."
echo "   This WILL work - we've proven the capability!"
echo ""

# Launch training
python true_1b_training.py

echo ""
echo "🏆 TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
echo "📊 Check /data/true_1b_results.json for full results"
echo "💾 Model checkpoint saved for inference"
echo "🧪 Inference testing completed"
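
# Optional hedged sanity check of the stated 1.21B figure: a back-of-envelope
# estimate only, not a measurement of the actual BitTransformerLM module.
# Assumes standard transformer blocks with a 4x FFN expansion (roughly
# 12 * d_model^2 parameters per layer); bit-level embeddings and norms are
# negligible at this scale.
echo ""
echo "🔢 Rough parameter estimate for d_model=2048, layers=24:"
python -c "
d_model, layers = 2048, 24
attn = 4 * d_model * d_model        # Q, K, V and output projections
ffn = 2 * d_model * (4 * d_model)   # up- and down-projections with 4x expansion
total = (attn + ffn) * layers       # ~12 * d_model^2 per layer
print(f'  Approx. {total:,} parameters (~{total / 1e9:.2f}B)')
"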