# Perform a multinode job with torchrun
name: torch-distributed

resources:
  image_id: nvcr.io/nvidia/pytorch:23.10-py3
  accelerators: H100:8
  cpus: 60
  memory: 500
  labels:
    # Kueue local queue to admit this workload into.
    kueue.x-k8s.io/queue-name: user-queue
    # Quoted so the all-digit value stays a string label, not an int.
    maxRunDurationSeconds: "3200"

num_nodes: 2

run: |
  git clone https://github.com/roanakb/pytorch-distributed-resnet
  cd pytorch-distributed-resnet
  # Fetch and unpack CIFAR-10 into ./data, then return to the repo root
  # so resnet_ddp.py is resolvable by the launcher below.
  mkdir -p data && mkdir -p saved_models && cd data && \
  wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
  tar -xvzf cifar-10-python.tar.gz
  cd ..
  # NOTE(review): torch.distributed.launch is deprecated in favor of torchrun;
  # kept as-is because resnet_ddp.py presumably parses --local_rank — confirm
  # before migrating. NUM_GPUS_PER_NODE/NUM_NODES/RANK/MASTER_ADDR are assumed
  # to be injected by the environment — TODO confirm.
  python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS_PER_NODE \
    --nnodes=$NUM_NODES --node_rank=$RANK --master_addr=$MASTER_ADDR \
    --master_port=8008 resnet_ddp.py --num_epochs 20