File size: 2,274 Bytes
759dfe0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
#! /bin/bash
# Enable job control and set process group
set -m
trap 'kill $(jobs -p)' EXIT INT TERM
# Get the directory where this script is located
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
DEFAULT_CONFIG="${SCRIPT_DIR}/configs/finetune.yaml"
# Parse command line arguments
# Print the command-line synopsis and option descriptions to stdout,
# then terminate the script with exit status 1.
# Globals: DEFAULT_CONFIG (read) - shown as the default config path.
usage() {
  printf '%s\n' \
    "Usage: $0 [-c|--config <config_path>] [-n|--num-gpus <num_gpus>]" \
    " -c, --config Path to config file (default: ${DEFAULT_CONFIG})" \
    " -n, --num-gpus Number of GPUs to use (default: 8)"
  exit 1
}
# Default values
CONFIG_PATH="${DEFAULT_CONFIG}"
NUM_GPUS=8

# Parse the command-line options into CONFIG_PATH and NUM_GPUS.
# Globals:   CONFIG_PATH, NUM_GPUS (written)
# Arguments: the script's positional parameters
# Exits via usage() on -h/--help, on an unknown option, or when an option
# that needs a value is the last argument. Without the explicit arity check
# `shift 2` fails without shifting anything and the loop never terminates.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -c|--config)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument"
          usage
        fi
        CONFIG_PATH="$2"
        shift 2
        ;;
      -n|--num-gpus)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument"
          usage
        fi
        NUM_GPUS="$2"
        shift 2
        ;;
      -h|--help)
        usage
        ;;
      *)
        echo "Unknown option: $1"
        usage
        ;;
    esac
  done
}

# Parse arguments
parse_args "$@"
# Abort early unless the config path refers to an existing regular file
[[ -f "${CONFIG_PATH}" ]] || {
  echo "Config file not found at ${CONFIG_PATH}"
  exit 1
}

# The GPU count must be a decimal integer >= 1 with no leading zero
[[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]] || {
  echo "Number of GPUs must be a positive integer"
  exit 1
}
# Export the distributed-training settings; WORLD_SIZE mirrors the GPU count
export MASTER_ADDR="localhost" MASTER_PORT=29500
export WORLD_SIZE="$NUM_GPUS"
export TF_CPP_MIN_LOG_LEVEL=3 COMPILE_DIT=1

# IS_DISTRIBUTED is exported only when more than one GPU was requested
if (( NUM_GPUS > 1 )); then
  export IS_DISTRIBUTED=true
fi
# Export the KEY=VALUE entries of an env file into the environment.
# Arguments: $1 - path to the env file (default: .env)
# Returns:   0, also when the file does not exist
# Blank lines and lines starting with '#' are skipped. An optional leading
# "export " and one pair of surrounding quotes around the value are removed.
# Entries are exported one at a time, so values containing spaces or glob
# characters stay intact, and a file without entries never results in a bare
# `export` that would print the whole environment.
load_env_file() {
  local env_file="${1:-.env}"
  local line key value

  [[ -f "$env_file" ]] || return 0

  # The `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"                      # tolerate CRLF line endings
    line="${line#"${line%%[![:space:]]*}"}"   # drop leading whitespace
    if [[ -z "$line" || "$line" == '#'* ]]; then
      continue
    fi
    line="${line#export }"
    if [[ "$line" != *=* ]]; then
      continue
    fi
    key="${line%%=*}"
    value="${line#*=}"
    if ! [[ "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      echo "Skipping invalid .env entry: ${key}" >&2
      continue
    fi
    # Remove one pair of matching surrounding quotes, if present.
    case "$value" in
      \"*\" | \'*\') value="${value:1:${#value}-2}" ;;
    esac
    export "${key}=${value}"
  done < "$env_file"
  return 0
}

# Load .env file (if it exists)
load_env_file ".env"
# Print the launch mode label: "distributed" when IS_DISTRIBUTED is set and
# non-empty, "single_gpu" otherwise. Combining the `:+` and `:-` expansions
# in one string printed "distributedtrue", because `:-` yields the variable's
# own value whenever it is set.
training_mode() {
  if [[ -n "${IS_DISTRIBUTED:-}" ]]; then
    echo "distributed"
  else
    echo "single_gpu"
  fi
}

echo "Starting training with ${NUM_GPUS} GPU(s), mode: $(training_mode)"
echo "Using config: ${CONFIG_PATH}"
# Launch processes: one background python process per GPU. In the multi-GPU
# case each process gets its own RANK and is pinned to the GPU with the same
# index through CUDA_VISIBLE_DEVICES. PIDs are recorded so every exit status
# can be checked individually.
pids=()
if [ "$NUM_GPUS" -gt 1 ]; then
  for ((rank = 0; rank < NUM_GPUS; rank++)); do
    RANK="$rank" CUDA_VISIBLE_DEVICES="$rank" \
      python "${SCRIPT_DIR}/train.py" --config-path "${CONFIG_PATH}" &
    pids+=("$!")
  done
else
  python "${SCRIPT_DIR}/train.py" --config-path "${CONFIG_PATH}" &
  pids+=("$!")
fi

# Wait for all background processes to complete.
# A bare `wait` always returns 0, so each PID is waited on explicitly to
# obtain that process's own exit status.
failed=0
for pid in "${pids[@]}"; do
  if ! wait "$pid"; then
    failed=1
  fi
done

# Check if any process failed
if [ "$failed" -ne 0 ]; then
  echo "One or more training processes failed"
  exit 1
fi