# Perform a multinode job with torchrun
name: torch-distributed

resources:
  image_id: nvcr.io/nvidia/pytorch:23.10-py3
  accelerators: H100:8
  cpus: 60
  memory: 500
  labels:
    # Kueue local queue to admit this workload into.
    kueue.x-k8s.io/queue-name: user-queue
    # Quoted so the all-digit value stays a string label, not an int.
    maxRunDurationSeconds: "3200"

num_nodes: 2

run: |
  git clone https://github.com/roanakb/pytorch-distributed-resnet
  cd pytorch-distributed-resnet
  # Fetch and unpack CIFAR-10 into ./data, then return to the repo root
  # so resnet_ddp.py is resolvable by the launcher below.
  mkdir -p data && mkdir -p saved_models && cd data && \
  wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
  tar -xvzf cifar-10-python.tar.gz
  cd ..
  # NOTE(review): torch.distributed.launch is deprecated in favor of torchrun;
  # kept as-is because resnet_ddp.py presumably parses --local_rank — confirm
  # before migrating. NUM_GPUS_PER_NODE/NUM_NODES/RANK/MASTER_ADDR are assumed
  # to be injected by the environment — TODO confirm.
  python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS_PER_NODE \
    --nnodes=$NUM_NODES --node_rank=$RANK --master_addr=$MASTER_ADDR \
    --master_port=8008 resnet_ddp.py --num_epochs 20