Trainy-Konduktor: An ML/AI GPU platform for high-performance batch jobs on Kubernetes (k8s).
bash
# my_task.yaml
name: tune

num_nodes: 2  # scale up your workload

resources:
  cpus: 15
  memory: 90
  accelerators: H100:8
  image_id: gcr.io/k8s-staging-jobset/pytorch-mnist:latest
  labels:
    kueue.x-k8s.io/queue-name: user-queue
    maxRunDurationSeconds: "3200"

run: |
  set -e
  NCCL_DEBUG=INFO torchrun --rdzv_id=123 --nnodes=$NUM_NODES --nproc_per_node=1 --master_addr=$MASTER_ADDR --master_port=1234 --node_rank=$RANK /workspace/mnist.py
$ konduktor launch my_task.yaml