#!/bin/bash
#
# BitTransformerLM Massive Scale Training Launcher
# =================================================
#
# Launches 680M-parameter (679,630,848) BitTransformerLM training across
# 4x NVIDIA L4 GPUs with FSDP (Fully Sharded Data Parallel) for maximum efficiency.
#

set -e  # Exit on any error

echo "🚀 BITTRANSFORMERLM MASSIVE SCALE TRAINING LAUNCHER"
echo "=================================================="
echo "Target: 680 MILLION parameters"
echo "Hardware: 4x NVIDIA L4 GPUs (23GB each)"
echo "Dataset: WikiText-103 + Real Corpus Data"
echo "Architecture: Reversible Transformer with Safety Telemetry"
echo ""

# Set environment variables for GPU visibility, allocator behavior, and NCCL
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
export NCCL_DEBUG=INFO
export NCCL_TREE_THRESHOLD=0

# Set HuggingFace token (falls back to placeholder if not provided)
export HF_TOKEN="${HF_TOKEN:-your-token-here}"

# Change to BitTransformerLM directory
cd /data/BitTransformerLM/BitTransformerLM

# Create checkpoint directory
mkdir -p /data/checkpoints

# Check GPU availability
echo "🔍 Checking GPU availability..."
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    print(f'  GPU {i}: {torch.cuda.get_device_name(i)} ({torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f}GB)')
"

echo ""
echo "📊 Model Configuration Preview:"
echo "  • Parameters: 679,630,848 (680M)"
echo "  • d_model: 1536"
echo "  • Layers: 24 (reversible)"
echo "  • Attention Heads: 24"
echo "  • Feed Forward: 6144"
echo "  • Sequence Length: 2048"
echo "  • Batch Size: 4 per GPU (16 total)"
echo "  • Gradient Accumulation: 32 steps"
echo "  • Effective Batch Size: 512"
echo ""

echo "🎯 Starting distributed training..."
echo "   Use Ctrl+C to stop training safely"
echo ""

# Launch distributed training with torchrun (single node, 4 processes)
torchrun \
    --nproc_per_node=4 \
    --master_port=29500 \
    --nnodes=1 \
    --node_rank=0 \
    massive_scale_training.py \
    --world-size 4 \
    --port 29500

echo ""
echo "🏁 Training completed!"
echo "Check /data/checkpoints/ for saved models"
echo "Check /data/massive_scale_training.log for detailed logs"
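
# Optional post-run summary (a sketch, not part of the original launcher):
# lists any checkpoints written under /data/checkpoints/ and tails the
# training log if present. Assumes the paths echoed above are where
# massive_scale_training.py actually writes its outputs.
if [ -d /data/checkpoints ]; then
    echo ""
    echo "Saved checkpoints:"
    ls -lh /data/checkpoints/
fi
if [ -f /data/massive_scale_training.log ]; then
    echo ""
    echo "Last lines of training log:"
    tail -n 20 /data/massive_scale_training.log
fi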