MPI
MPI
Example task YAML files for running MPI jobs
Prerequisites
Config.yaml (`~/.konduktor/config.yaml`)
Copy
# Enable SSH; the MPI task below reaches the other nodes over SSH.
ssh:
  enable: true
Current Working Directory
Copy
$ ls
task.yaml
Launching
Copy
$ konduktor launch task.yaml
Task.yaml
Copy
# Two-node example task: builds an Open MPI hostfile from NODE_HOST_IPS and
# runs `hostname` across all nodes with mpirun, launched from the rank-0 node.
name: mpirun-hostname

num_nodes: 2

resources:
  cpus: 1
  memory: 2
  image_id: ubuntu
  labels:
    kueue.x-k8s.io/queue-name: user-queue
    maxRunDurationSeconds: "3200"

run: |
  echo "Starting workload container on for ${NUM_NODES} benchmark"

  # MPI slots advertised per node in the hostfile; also used to size -np so
  # the two values cannot drift apart.
  SLOTS_PER_NODE=8

  # Install ping; the non-head nodes use it below to watch the head node.
  # apt-get is used instead of apt because its CLI is stable for scripts.
  apt-get update -y
  apt-get install -y iputils-ping

  # Install Open MPI. Skip this step if your container image already ships it.
  apt-get install -y openmpi-bin libopenmpi-dev

  # For every node in the comma-separated NODE_HOST_IPS list, wait until it
  # accepts SSH on port 2222 and has `orted` on its PATH, then append it to
  # the hostfile. The expansion is intentionally unquoted so it word-splits.
  for HOST in ${NODE_HOST_IPS//,/ }; do
    until ssh -p 2222 -i ~/.ssh/konduktor-key -o StrictHostKeyChecking=no "${HOST}" -- 'command -v orted'; do
      echo "Waiting for ${HOST}..."
      sleep 10
    done
    echo "${HOST} port=2222 slots=${SLOTS_PER_NODE}" | tee -a /tmp/hostfile
  done
  cat /tmp/hostfile

  # Launch from head node
  if [[ "${RANK}" -eq 0 ]]; then
    mpirun -np "$((NUM_NODES * SLOTS_PER_NODE))" \
      --hostfile /tmp/hostfile \
      --mca plm_rsh_no_tree_spawn 1 \
      --mca orte_keep_fqdn_hostnames 1 \
      --mca btl self,tcp \
      --mca btl_tcp_if_include eth0 \
      --allow-run-as-root \
      --bind-to none \
      --mca orte_base_help_aggregate 0 \
      --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 2222 -i ~/.ssh/konduktor-key -o UserKnownHostsFile=/dev/null" \
      hostname
  else
    # Non-head nodes launch nothing themselves; they stay alive for as long
    # as the head node (MASTER_ADDR) keeps answering pings.
    while ping -c 1 "${MASTER_ADDR}"; do
      sleep 5
    done
  fi

  # Always report success, regardless of the exit status of mpirun above.
  exit 0
Assistant
Responses are generated using AI and may contain mistakes.