#!/usr/bin/env bash
#
# tools/scripts/slurm_test_mgpu.sh
#
# Launch `test.py` on multiple GPUs through Slurm's `srun`.
#
# Usage:
#   slurm_test_mgpu.sh PARTITION GPUS [PY_ARGS...]
#
# Arguments:
#   PARTITION  Value passed to `srun -p`.
#   GPUS       Number of tasks; also used as the per-node GPU/task count,
#              so every task is requested on a single node.
#   PY_ARGS    Any remaining arguments, forwarded verbatim to test.py.
#
# Environment:
#   SRUN_ARGS  Optional extra flags for srun. The value is split on
#              whitespace so that it can carry several flags.
#
# Outputs:
#   Prints the chosen TCP port on stdout before launching the job.
#   The script's exit status is that of srun.

set -x

if (( $# < 2 )); then
  printf 'Usage: %s PARTITION GPUS [PY_ARGS...]\n' "${0##*/}" >&2
  exit 2
fi

PARTITION=$1
GPUS=$2
GPUS_PER_NODE=$GPUS
# Keep the forwarded arguments in an array so that arguments containing
# spaces or glob characters reach test.py unchanged.
PY_ARGS=("${@:3}")
JOB_NAME=eval

# SRUN_ARGS is a flat string of flags; splitting it into words is intended.
# shellcheck disable=SC2206
SRUN_EXTRA=(${SRUN_ARGS:-})

# Pick a random port in [10000, 59151] and retry while something is already
# listening on it locally. RANDOM yields 15 bits, so two draws are combined
# to cover the whole range.
# Any non-zero status from nc (including nc not being installed) is treated
# as "port is free".
while true; do
  PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
  if ! nc -z 127.0.0.1 "$PORT" < /dev/null &> /dev/null; then
    break
  fi
done
echo "$PORT"

srun -p "${PARTITION}" \
  --job-name="${JOB_NAME}" \
  --gres=gpu:"${GPUS_PER_NODE}" \
  --ntasks="${GPUS}" \
  --ntasks-per-node="${GPUS_PER_NODE}" \
  --kill-on-bad-exit=1 \
  "${SRUN_EXTRA[@]}" \
  python -u test.py --launcher slurm --tcp_port "$PORT" "${PY_ARGS[@]}"