#!/bin/bash

set -e

echo "TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
echo "================================================="
echo "PROPER FSDP SHARDING (parameters sharded across GPUs, not replicated)"
echo "Based on the proven 680M-parameter run"
echo "Full training + inference testing"
echo ""

export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12
export HF_TOKEN="${HF_TOKEN:-your-token-here}"
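
# Fail fast if no real token was provided (an assumption: the training script
# pulls data or pushes checkpoints via the Hugging Face Hub and needs HF_TOKEN;
# remove this guard if it does not).
if [ "$HF_TOKEN" = "your-token-here" ]; then
    echo "ERROR: HF_TOKEN is not set. Export a valid Hugging Face token first." >&2
    exit 1
fi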
|
|
|
cd /data/BitTransformerLM/BitTransformerLM

echo "Hardware Check:"
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
print(f'Total VRAM: {sum(torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count())) / 1024**3:.1f}GB')
"
|
|
|
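# Optional guard (an assumption: the FSDP configuration below expects all four
# GPUs listed in CUDA_VISIBLE_DEVICES; drop this check to run on fewer).
python -c "
import sys
import torch
sys.exit(0 if torch.cuda.device_count() >= 4 else 1)
" || { echo "ERROR: fewer than 4 GPUs visible; this FSDP config expects 4." >&2; exit 1; }
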
echo ""
echo "TRUE 1.21B CONFIGURATION:"
echo "  Parameters: 1,210,000,000+ (1.21B)"
echo "  Architecture: d_model=2048, layers=24, heads=32"
echo "  Memory strategy: FSDP full sharding across 4 GPUs"
echo "  Sequence length: 512 (validated in the 680M run)"
echo "  Mixed precision: FP16"
echo "  Safety telemetry: K, C, S metrics enabled"
echo "  All optimizations: reversible layers + checkpointing + chunked attention"
echo ""
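
# Quick sanity check on the parameter target before launching (a sketch; uses
# the generic 12 * layers * d_model^2 transformer estimate and ignores
# embeddings and norms -- BitTransformerLM's exact count may differ slightly).
python -c "
d_model, layers = 2048, 24
approx = 12 * layers * d_model ** 2
print(f'Approximate parameter count for d_model={d_model}, layers={layers}: {approx:,} (~{approx / 1e9:.2f}B)')
"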
|
|
|
echo "Starting TRUE 1.21B parameter training..."
echo "  (scaling up the configuration validated at 680M parameters)"
echo ""
|
|
|
|
|
python true_1b_training.py
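
# Post-run check (an assumption, based on the path echoed below: the training
# script writes its summary to /data/true_1b_results.json).
if [ ! -f /data/true_1b_results.json ]; then
    echo "WARNING: expected results file /data/true_1b_results.json was not found." >&2
fi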
|
|
|
echo ""
echo "TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
echo "Check /data/true_1b_results.json for full results"
echo "Model checkpoint saved for inference"
echo "Inference testing completed"