Prerequisites

Config.yaml (~/.konduktor/config.yaml)

# Turns on the `ssh.enable` option in the konduktor config.
# NOTE(review): presumably this is what provides the node-to-node SSH access
# (port 2222, ~/.ssh/konduktor-key) that the task script relies on — confirm.
ssh:
  enable: true

Current Working Directory

$ ls
task.yaml

Launching

$ konduktor launch task.yaml

Task.yaml

# Name of the task.
name: mpirun-hostname

# Number of nodes requested. The run script reads a NUM_NODES variable,
# presumably derived from this value — confirm.
num_nodes: 2

# Per-node resource request.
resources:
  image_id: ubuntu
  cpus: 1
  # NOTE(review): memory units are not stated here — confirm.
  memory: 2
  labels:
    kueue.x-k8s.io/queue-name: user-queue
    # Quoted so the all-digit value stays a string rather than an int.
    maxRunDurationSeconds: "3200"

run: |
  # Runs on every node of the task. Reads NUM_NODES, NODE_HOST_IPS, RANK and
  # MASTER_ADDR from the environment (presumably injected by the launcher —
  # confirm). The node with RANK 0 launches mpirun; every other node just
  # stays alive until the head node goes away.
  echo "Starting workload container on for ${NUM_NODES} benchmark"

  # MPI slots advertised per host. Used for both the hostfile entries and the
  # total process count passed to mpirun, so the two cannot drift apart.
  SLOTS_PER_NODE=8
  HOSTFILE=/tmp/hostfile

  # Install ping (workers use it below to watch the head node)
  apt update -y
  apt install -y iputils-ping
  # Install Open MPI (mpirun/orted); skip this if your image already ships it
  apt install -y openmpi-bin libopenmpi-dev

  # Start from an empty hostfile so that re-running this script on the same
  # filesystem does not append duplicate host entries.
  : > "${HOSTFILE}"

  # For every node, wait until it answers over SSH and has `orted` on its
  # PATH, then add it to the hostfile
  for HOST in $(echo "${NODE_HOST_IPS}" | sed "s/,/ /g"); do
    until ssh -p 2222 -i ~/.ssh/konduktor-key -o StrictHostKeyChecking=no "${HOST}" -- 'which orted'; do
      echo "Waiting for ${HOST}..."
      sleep 10
    done
    echo "${HOST} port=2222 slots=${SLOTS_PER_NODE}" | tee -a "${HOSTFILE}"
  done

  cat "${HOSTFILE}"

  # Launch from head node
  if [[ "${RANK}" -eq "0" ]]; then

    # One process per advertised slot across all nodes; each process simply
    # runs `hostname`.
    mpirun -np $((NUM_NODES * SLOTS_PER_NODE)) \
      --hostfile "${HOSTFILE}" \
      --mca plm_rsh_no_tree_spawn 1 \
      --mca orte_keep_fqdn_hostnames 1 \
      --mca btl self,tcp \
      --mca btl_tcp_if_include eth0 \
      --allow-run-as-root \
      --bind-to none \
      --mca orte_base_help_aggregate 0 \
      --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 2222 -i ~/.ssh/konduktor-key -o UserKnownHostsFile=/dev/null" \
      hostname

  else
    # Workers keep looping for as long as the head node answers ping, and
    # fall through to `exit 0` once it stops responding.
    while ping -c 1 "${MASTER_ADDR}"; do
      sleep 5
    done
  fi

  exit 0