#!/bin/bash
#
# BitTransformerLM Massive Scale Training Launcher
# =================================================
#
# Launches ~680M parameter (679,630,848) BitTransformerLM training across 4x NVIDIA L4 GPUs
# with FSDP (Fully Sharded Data Parallel) for maximum efficiency.
#
set -e # Exit on any error
echo "BITTRANSFORMERLM MASSIVE SCALE TRAINING LAUNCHER"
echo "=================================================="
echo "Target: 680 MILLION parameters"
echo "Hardware: 4x NVIDIA L4 GPUs (23GB each)"
echo "Dataset: WikiText-103 + Real Corpus Data"
echo "Architecture: Reversible Transformer with Safety Telemetry"
echo ""
# Set environment variables
export CUDA_VISIBLE_DEVICES=0,1,2,3                     # expose all four L4 GPUs to this job
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512    # cap allocator block splitting to reduce fragmentation
export NCCL_DEBUG=INFO                                  # verbose NCCL logs for debugging collectives
export NCCL_TREE_THRESHOLD=0                            # force ring collectives instead of tree
# Set HuggingFace token
export HF_TOKEN="${HF_TOKEN:-your-token-here}"
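# Optional guard (not part of the original script): warn if the token was left at
# the placeholder value, since authenticated HuggingFace Hub access would then fail.
if [ "${HF_TOKEN}" = "your-token-here" ]; then
    echo "WARNING: HF_TOKEN is still the placeholder value."
fi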
# Change to BitTransformerLM directory
cd /data/BitTransformerLM/BitTransformerLM
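# Path assumption: the checkout location above may differ on other machines; with
# `set -e`, a failed cd aborts the launcher before any training starts.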
# Create checkpoint directory
mkdir -p /data/checkpoints
# Check GPU availability
echo "Checking GPU availability..."
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    print(f' GPU {i}: {torch.cuda.get_device_name(i)} ({torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f}GB)')
"
echo ""
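# Optional sanity check (an addition, not part of the original flow): stop early if
# fewer than 4 GPUs are visible, since torchrun below spawns one process per GPU.
python -c "import sys, torch; sys.exit(0 if torch.cuda.device_count() >= 4 else 1)" || {
    echo "ERROR: expected 4 visible GPUs; check CUDA_VISIBLE_DEVICES and the driver setup."
    exit 1
}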
echo "Model Configuration Preview:"
echo "  • Parameters: 679,630,848 (680M)"
echo "  • d_model: 1536"
echo "  • Layers: 24 (reversible)"
echo "  • Attention Heads: 24"
echo "  • Feed Forward: 6144"
echo "  • Sequence Length: 2048"
echo "  • Batch Size: 4 per GPU (16 total)"
echo "  • Gradient Accumulation: 32 steps"
echo "  • Effective Batch Size: 512"
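# Back-of-the-envelope parameter check (an approximation; the exact layout is defined
# in massive_scale_training.py, so treat the per-block breakdown as an assumption):
#   attention per layer    ~ 4 * 1536^2        =   9,437,184
#   feed-forward per layer ~ 2 * 1536 * 6144   =  18,874,368
#   24 layers * ~28,311,552                    = 679,477,248
#   + embeddings, norms, output head           ~ 679,630,848 total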
echo ""
echo "Starting distributed training..."
echo " Use Ctrl+C to stop training safely"
echo ""
# Launch distributed training with torchrun
torchrun \
    --nproc_per_node=4 \
    --master_port=29500 \
    --nnodes=1 \
    --node_rank=0 \
    massive_scale_training.py \
        --world-size 4 \
        --port 29500
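# Note: because of `set -e`, the completion messages below only print if torchrun
# exits successfully. The log file referenced below is assumed to be written by
# massive_scale_training.py itself; alternatively, the torchrun invocation above
# could be piped through `tee /data/massive_scale_training.log` to capture stdout.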
echo ""
echo "Training completed!"
echo "Check /data/checkpoints/ for saved models"
echo "Check /data/massive_scale_training.log for detailed logs"