#!/bin/bash
# Launcher for BitTransformerLM massive-scale (680M parameter) distributed training
# on 4x NVIDIA L4 GPUs.

set -e
echo "BITTRANSFORMERLM MASSIVE SCALE TRAINING LAUNCHER"
echo "=================================================="
echo "Target: 680 MILLION parameters"
echo "Hardware: 4x NVIDIA L4 GPUs (23GB each)"
echo "Dataset: WikiText-103 + Real Corpus Data"
echo "Architecture: Reversible Transformer with Safety Telemetry"
echo ""
# Multi-GPU environment configuration.
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
export NCCL_DEBUG=INFO
export NCCL_TREE_THRESHOLD=0

# Hugging Face token: export HF_TOKEN before running; the placeholder is only a fallback.
export HF_TOKEN="${HF_TOKEN:-your-token-here}"
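
# Optional guard (an addition, not part of the original launcher): fail fast if the
# token is still the placeholder, on the assumption that the training script needs
# a valid Hugging Face token to download datasets.
if [ "$HF_TOKEN" = "your-token-here" ]; then
    echo "ERROR: HF_TOKEN is still the placeholder value. Export a valid Hugging Face token and re-run." >&2
    exit 1
fi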

# Run from the repository checkout.
cd /data/BitTransformerLM/BitTransformerLM

# Checkpoints are written here by the training script.
mkdir -p /data/checkpoints

echo "Checking GPU availability..."
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    print(f'  GPU {i}: {torch.cuda.get_device_name(i)} ({torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f}GB)')
"
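
# Optional guard (an addition, not in the original launcher): abort if fewer than 4
# GPUs are visible, since torchrun below launches 4 processes (--nproc_per_node=4).
GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")
if [ "$GPU_COUNT" -lt 4 ]; then
    echo "ERROR: expected 4 GPUs but found $GPU_COUNT. Check CUDA_VISIBLE_DEVICES." >&2
    exit 1
fi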

echo ""
echo "Model Configuration Preview:"
echo "  • Parameters: 679,630,848 (680M)"
echo "  • d_model: 1536"
echo "  • Layers: 24 (reversible)"
echo "  • Attention Heads: 24"
echo "  • Feed Forward: 6144"
echo "  • Sequence Length: 2048"
echo "  • Batch Size: 4 per GPU (16 total)"
echo "  • Gradient Accumulation: 32 steps"
echo "  • Effective Batch Size: 512"
echo ""
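
# Illustrative arithmetic for the figures above (an addition, not used by the
# training script): effective batch = per-GPU batch x GPU count x accumulation steps.
PER_GPU_BATCH=4
NUM_GPUS=4
GRAD_ACCUM_STEPS=32
echo "  (effective batch check: $((PER_GPU_BATCH * NUM_GPUS * GRAD_ACCUM_STEPS)) sequences per optimizer step)"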

echo "Starting distributed training..."
echo "  Use Ctrl+C to stop training safely"
echo ""
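
# Optional SIGINT handler (an addition, describing assumed behavior): print a clear
# message when the run is interrupted with Ctrl+C; torchrun is expected to tear down
# its worker processes when it receives the signal.
trap 'echo ""; echo "Training interrupted by user."; exit 130' INT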

torchrun \
    --nproc_per_node=4 \
    --master_port=29500 \
    --nnodes=1 \
    --node_rank=0 \
    massive_scale_training.py \
    --world-size 4 \
    --port 29500

echo ""
echo "Training completed!"
echo "Check /data/checkpoints/ for saved models"
echo "Check /data/massive_scale_training.log for detailed logs"
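
# Convenience listing (an addition, not in the original script): show the newest
# files in the checkpoint directory so the run's output is visible at a glance.
ls -lht /data/checkpoints | head -n 10 || true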