#!/bin/bash
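# NOTE: the usual #SBATCH directives (nodes, tasks per node, GPUs per node, time,
# partition, etc.) are assumed to be set here or passed at submission time, e.g.
# (hypothetical values): sbatch --nodes=2 --gpus-per-node=8 --time=24:00:00 <this script>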
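
# Make the CUDA toolkit visible to the training processes.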
export PATH=/usr/local/cuda/bin:$PATH
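
# Timestamp that can be used to tag this run's logs and outputs.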
timestamp=$(date +"%Y%m%d-%H%M")
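
# Use the first node in the Slurm allocation as the rendezvous (master) node.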
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
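
# Ask the OS for a free TCP port to use as the distributed rendezvous port.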
export MASTER_PORT=$(python - <<EOF
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind(('', 0))  # OS will allocate a free port
free_port = sock.getsockname()[1]
sock.close()
print(free_port)
EOF
)
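
# Print the distributed setup for easier debugging in the Slurm log.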
echo "Master node: $MASTER_ADDR" |
|
echo "Master port: $MASTER_PORT" |
|
echo "Number of nodes: $SLURM_NNODES" |
|
echo "GPUs per node: $SLURM_GPUS_ON_NODE" |
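
# Training config and output directory for the multi-node tiny-model run.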
config=configs/sam2.1_hiera_tiny_finetune512.yaml
output_path=./exp_log/mnode_tiny
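
# Launch the trainer on every node in the allocation; the master address/port
# computed above are passed through so all ranks can rendezvous.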
srun --exclusive python training/train.py \
    -c $config \
    --output-path $output_path \
    --use-cluster 0 \
    --num-gpus $SLURM_GPUS_ON_NODE \
    --num-nodes $SLURM_NNODES \
    --master-addr $MASTER_ADDR \
    --main-port $MASTER_PORT
echo "training done" |