#!/bin/bash
#
# Launch TRUE 1.21B Parameter BitTransformerLM Training
# ====================================================
# 
# PROPER FSDP sharding across 4 GPUs + inference testing!
#
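# A minimal sketch of what "full sharding" means here (assumption: the
# standard torch.distributed.fsdp API; the actual FSDP wrapping happens
# inside true_1b_training.py, which this launcher only invokes):
#
#   from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy
#   model = FSDP(model, sharding_strategy=ShardingStrategy.FULL_SHARD)
#
# FULL_SHARD partitions parameters, gradients, and optimizer state across
# all 4 ranks, instead of replicating the full 1.21B weights on every GPU.
#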

set -e

echo "πŸ”₯ TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
echo "================================================="
echo "🎯 PROPER FSDP SHARDING (not duplication!)"
echo "βœ… Based on proven 680M success"
echo "πŸš€ Full training + inference testing"
echo ""

# Environment setup: expose the 4 target GPUs, let the CUDA caching allocator
# grow segments to reduce fragmentation, and cap CPU-side OpenMP threads.
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12
# Require a real token from the environment instead of falling back to a
# placeholder that would fail authentication at runtime.
export HF_TOKEN="${HF_TOKEN:?Set HF_TOKEN before running this script}"

cd /data/BitTransformerLM/BitTransformerLM

echo "πŸ” Hardware Check:"
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
print(f'Total VRAM: {sum(torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count())) / 1024**3:.1f}GB')
"

echo ""
echo "βš™οΈ TRUE 1.21B CONFIGURATION:"
echo "  🎯 Parameters: 1,210,000,000+ (1.21B)"
echo "  πŸ“ Architecture: d_model=2048, layers=24, heads=32"
echo "  🧠 Memory Strategy: FSDP Full Sharding across 4 GPUs"
echo "  πŸ”„ Sequence Length: 512 (optimized from 680M success)"
echo "  ⚑ Mixed Precision: FP16"
echo "  πŸ›‘οΈ Safety Telemetry: K, C, S metrics enabled"
echo "  πŸ”§ All Optimizations: Reversible + Checkpointing + Chunked Attention"
echo ""

echo "πŸš€ Starting TRUE 1.21B parameter training..."
echo "   This WILL work - we've proven the capability!"
echo ""

# Launch training
python true_1b_training.py
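
# Post-run check (assumption: true_1b_training.py writes its summary to
# /data/true_1b_results.json, as reported below); warn rather than fail if
# the file is missing.
if [ ! -f /data/true_1b_results.json ]; then
    echo "⚠️ Expected results file /data/true_1b_results.json was not found."
fi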

echo ""
echo "πŸ† TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
echo "πŸ“Š Check /data/true_1b_results.json for full results"
echo "πŸ’Ύ Model checkpoint saved for inference"
echo "πŸ§ͺ Inference testing completed"