#!/bin/bash
#
# Per-task launch wrapper for a multi-node torch.distributed.run job
# started under Slurm. Any arguments given to this script are forwarded
# unchanged to torch.distributed.run.

# Node-local housekeeping. Only the task whose SLURM_LOCALID is 0 clears
# /dev/shm and prints the rocm-smi report; every other task (and any
# invocation where SLURM_LOCALID is unset) just pauses for two seconds.
# NOTE(review): the pause presumably gives the cleanup time to finish
# before the remaining local tasks continue -- confirm.
if [[ "${SLURM_LOCALID:-}" == "0" ]]; then
  rm -rf /dev/shm/*
  # Diagnostic output only: a missing or failing rocm-smi must not stop
  # the job.
  rocm-smi || true
else
  sleep 2
fi

# Node count handed to the launcher. The value provided by Slurm wins;
# 64 is only the fallback when the variable is unset or empty, so the
# launcher is never told to wait for more nodes than the job really has.
SLURM_JOB_NUM_NODES="${SLURM_JOB_NUM_NODES:-64}"

# Network interfaces NCCL/RCCL is allowed to use for socket traffic.
export NCCL_SOCKET_IFNAME=hsn0,hsn1,hsn2,hsn3

# One OpenMP thread per CPU that Slurm assigned to this task. The value is
# empty when SLURM_CPUS_PER_TASK is not set (e.g. no --cpus-per-task).
export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK:-}"

# Default completion-queue size for the libfabric CXI provider.
export FI_CXI_DEFAULT_CQ_SIZE=262144

# Software environment. The load order below is kept exactly as it was,
# because later modules may depend on earlier ones.
module --quiet purge
module load cray-python

module load CrayEnv
module load PrgEnv-cray/8.3.3
module load craype-accel-amd-gfx90a
# NOTE(review): cray-python is loaded a second time here, after CrayEnv
# and PrgEnv-cray -- kept in case those loads displace it; confirm whether
# both loads are needed.
module load cray-python

# Extra module tree providing the ROCm / RCCL / aws-ofi-rccl builds.
module use /pfs/lustrep2/projappl/project_462000125/samantao-public/mymodules
module load suse-repo-deps/sam-default
module load rocm/sam-5.2.3.lua
module load rccl/sam-develop.lua
module load aws-ofi-rccl/sam-default.lua

# The virtual environment is resolved relative to the current working
# directory. Stop here if it cannot be activated instead of launching
# with whatever "python" happens to be on PATH.
source venv/bin/activate || {
  echo "error: could not activate venv/bin/activate (cwd: $PWD)" >&2
  exit 1
}

# Fail early, with a readable message, when a Slurm variable the launcher
# depends on is missing; otherwise an empty expansion would surface later
# as a confusing argument-parsing error.
: "${SLURM_JOB_NODELIST:?SLURM_JOB_NODELIST is not set; run this script inside a Slurm job}"
: "${SLURM_PROCID:?SLURM_PROCID is not set; run this script via srun}"
: "${SLURM_GPUS_ON_NODE:?SLURM_GPUS_ON_NODE is not set; request GPUs for the job}"

# Rendezvous endpoint: the first host of the job's node list, on port
# 9999 unless MASTER_PORT is already set in the environment.
MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT="${MASTER_PORT:-9999}"

if [[ -z "$MASTER_NODE" ]]; then
  echo "error: could not resolve a master node from '$SLURM_JOB_NODELIST'" >&2
  exit 1
fi

# One status line per task; the last field reports whether torch can see
# an accelerator from inside the activated environment.
echo "Launching on $SLURMD_NODENAME ($SLURM_PROCID/$SLURM_JOB_NUM_NODES)," \
  "master $MASTER_NODE port $MASTER_PORT," \
  "GPUs $SLURM_GPUS_ON_NODE," \
  "CUDA: $(python -c 'import torch; print(torch.cuda.is_available())')"

# Start the per-node launcher. SLURM_PROCID is used directly as the node
# rank, and the script's own arguments are passed through untouched.
# NOTE(review): using SLURM_PROCID as node rank assumes one Slurm task per
# node -- confirm against the sbatch/srun settings.
python -u -m torch.distributed.run \
  --nnodes "$SLURM_JOB_NUM_NODES" \
  --nproc_per_node "$SLURM_GPUS_ON_NODE" \
  --node_rank="$SLURM_PROCID" \
  --master_addr "$MASTER_NODE" \
  --master_port "$MASTER_PORT" \
  "$@"