From b4aefdc5f862b206fff4838625028568a3b24c5e Mon Sep 17 00:00:00 2001
From: inter
Date: Sun, 21 Sep 2025 20:19:35 +0800
Subject: [PATCH] Add File

---
 tools/scripts/slurm_train.sh | 54 +++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 tools/scripts/slurm_train.sh

diff --git a/tools/scripts/slurm_train.sh b/tools/scripts/slurm_train.sh
new file mode 100644
index 0000000..7a4533f
--- /dev/null
+++ b/tools/scripts/slurm_train.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#
+# Launch distributed training of train.py on a Slurm cluster through srun.
+#
+# Usage:
+#   slurm_train.sh PARTITION JOB_NAME GPUS [train.py arguments...]
+#
+# Optional environment variables:
+#   GPUS_PER_NODE  value for --gres=gpu: and --ntasks-per-node (default: 8)
+#   CPUS_PER_TASK  value for --cpus-per-task (default: 5)
+#   SRUN_ARGS      extra srun options, split on whitespace (default: none)
+
+set -x
+
+if (( $# < 3 )); then
+  printf 'Usage: %s PARTITION JOB_NAME GPUS [train.py arguments...]\n' \
+    "${0##*/}" >&2
+  exit 2
+fi
+
+PARTITION=$1
+JOB_NAME=$2
+GPUS=$3
+# Keep the remaining arguments as an array so that values containing spaces
+# or glob characters reach train.py as the caller wrote them.
+PY_ARGS=("${@:4}")
+
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+# Draw random ports in [10000, 59151] until nc cannot connect to one on
+# 127.0.0.1. The probe runs on the host executing this script only, and any
+# nc failure (including nc being unavailable) counts as "port free".
+while true; do
+  PORT=$(( ((RANDOM << 15) | RANDOM) % 49152 + 10000 ))
+  if ! nc -z 127.0.0.1 "${PORT}" < /dev/null &> /dev/null; then
+    break
+  fi
+done
+echo "${PORT}"
+
+# SRUN_ARGS is expanded unquoted on purpose: it is a plain string that may
+# carry several options.
+# shellcheck disable=SC2086
+srun -p "${PARTITION}" \
+  --job-name="${JOB_NAME}" \
+  --gres=gpu:"${GPUS_PER_NODE}" \
+  --ntasks="${GPUS}" \
+  --ntasks-per-node="${GPUS_PER_NODE}" \
+  --cpus-per-task="${CPUS_PER_TASK}" \
+  --kill-on-bad-exit=1 \
+  ${SRUN_ARGS} \
+  python -u train.py --launcher slurm --tcp_port "${PORT}" "${PY_ARGS[@]}"