#!/bin/bash
#
# BitTransformerLM Massive Scale Training Launcher
# =================================================
# 
# Launches ~680M parameter (679,630,848) BitTransformerLM training across 4x NVIDIA L4 GPUs
# with FSDP (Fully Sharded Data Parallel) for memory-efficient multi-GPU training.
#

set -e  # Exit on any error

echo "πŸš€ BITTRANSFORMERLM MASSIVE SCALE TRAINING LAUNCHER"
echo "=================================================="
echo "Target: 680 MILLION parameters"
echo "Hardware: 4x NVIDIA L4 GPUs (23GB each)"
echo "Dataset: WikiText-103 + Real Corpus Data"
echo "Architecture: Reversible Transformer with Safety Telemetry"
echo ""

# Set environment variables
export CUDA_VISIBLE_DEVICES=0,1,2,3                    # Expose all four L4 GPUs to this job
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512   # Cap allocator split size to reduce memory fragmentation
export NCCL_DEBUG=INFO                                 # Verbose NCCL logging for diagnosing collective issues
export NCCL_TREE_THRESHOLD=0                           # Disable the NCCL tree algorithm, forcing ring collectives

# Set HuggingFace token
export HF_TOKEN="${HF_TOKEN:-your-token-here}"
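
# Optional sanity check (sketch): warn if HF_TOKEN is still the placeholder above,
# since authenticated Hugging Face Hub downloads would fail without a real token.
if [ "$HF_TOKEN" = "your-token-here" ]; then
    echo "⚠️  HF_TOKEN not set - authenticated Hub downloads may fail."
fi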

# Change to BitTransformerLM directory
cd /data/BitTransformerLM/BitTransformerLM

# Create checkpoint directory
mkdir -p /data/checkpoints

# Check GPU availability
echo "πŸ” Checking GPU availability..."
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    print(f'  GPU {i}: {torch.cuda.get_device_name(i)} ({torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f}GB)')
"

echo ""
echo "πŸ“Š Model Configuration Preview:"
echo "  β€’ Parameters: 679,630,848 (680M)" 
echo "  β€’ d_model: 1536"
echo "  β€’ Layers: 24 (reversible)"
echo "  β€’ Attention Heads: 24"
echo "  β€’ Feed Forward: 6144"
echo "  β€’ Sequence Length: 2048"
echo "  β€’ Batch Size: 4 per GPU (16 total)"
echo "  β€’ Gradient Accumulation: 32 steps"
echo "  β€’ Effective Batch Size: 512"
echo ""

echo "🎯 Starting distributed training..."
echo "   Use Ctrl+C to stop training safely"
echo ""

# Launch distributed training with torchrun
torchrun \
    --nproc_per_node=4 \
    --master_port=29500 \
    --nnodes=1 \
    --node_rank=0 \
    massive_scale_training.py \
    --world-size 4 \
    --port 29500
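
# Optional post-run step (sketch): list any saved checkpoints so the paths echoed
# below are easy to verify. Assumes checkpoints are written under /data/checkpoints/.
ls -lh /data/checkpoints/ 2>/dev/null || true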

echo ""
echo "🏁 Training completed!"
echo "Check /data/checkpoints/ for saved models"
echo "Check /data/massive_scale_training.log for detailed logs"