VeRL (Volcano Engine Reinforcement Learning) is a production-ready RL training library for large language models (LLMs).
verl.yaml
Job definition YAML for VeRL multi-node training.
# Multi-node distributed training with Verl (Volcano Engine Reinforcement Learning) framework.
#
# Verl is a flexible and efficient reinforcement learning framework designed for
# training large language models with RLHF (Reinforcement Learning from Human Feedback).
# This example demonstrates multi-node training using PPO on the GSM8K dataset.
#
# Prerequisites:
# - Access to Hugging Face models (Qwen/Qwen2.5-0.5B-Instruct in this example)
#
# Usage:
# # Launch a 2-node training cluster:
# $ konduktor launch examples/verl/multinode.yaml
#
# # Stream logs:
# $ konduktor logs verl-multinode-training-*
#
# # Cleanup:
# $ konduktor down verl-multinode-training-*
# Known issues: GLOO_SOCKET_IFNAME must be set to the primary network interface,
# which varies per platform.
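# For example, if the primary interface on your nodes is eth0 (the interface
# name here is only an illustration; check with `ip addr` or `ibdev2netdev`),
# you could add it to `envs:` below:
#   GLOO_SOCKET_IFNAME: eth0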
name: verl-multinode-training
resources:
accelerators: H200:8
image_id: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
cpus: 180
memory: 1000
labels:
kueue.x-k8s.io/queue-name: user-queue
maxRunDurationSeconds: "320000"
num_nodes: 2 # Number of nodes for distributed training
# Environment variables
envs:
HF_HUB_ENABLE_HF_TRANSFER: "1"
TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
# Optional: Add your W&B API key for experiment tracking
WANDB_API_KEY: <YOUR_WANDB_KEY>
# Training configuration
MODEL_NAME: Qwen/Qwen2.5-0.5B-Instruct
TOTAL_EPOCHS: 3
ACTOR_LR: 1e-6
CRITIC_LR: 1e-5
run: |
set -x
export PIP_CONSTRAINT=""
mkdir -p $HOME/checkpoints
apt update -y
apt install -y iputils-ping
git clone https://github.com/volcengine/verl.git
cd verl
# Install Verl and its dependencies (skip Megatron for this example)
#USE_MEGATRON=0 bash scripts/install_vllm_sglang_mcore.sh
pip install --no-deps -e .
#pip install "ray[default]" # For Ray dashboard
# Set up distributed training environment
echo "Head IP: $MASTER_ADDR"
echo "Number of nodes: $NUM_NODES"
# Create custom runtime environment configuration
cat > runtime_env_custom.yaml <<EOF
working_dir: ./
excludes: ["/.git/", "*.whl", "**/*.whl"]
env_vars:
TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
CUDA_DEVICE_MAX_CONNECTIONS: "1"
HF_HUB_ENABLE_HF_TRANSFER: "1"
EOF
# Ray cluster configuration
HEAD_PORT=6385
DASH_PORT=8280
# Function to check if Ray is already running
is_ray_alive () {
ray status --address="$1:$HEAD_PORT" >/dev/null 2>&1
}
if [ "$RANK" == "0" ]; then
# Head node: prepare data, download model, start Ray head, and submit training job
echo "Setting up head node..."
# Install additional dependencies for data processing
pip install datasets transformers
# Prepare GSM8K dataset
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
# Download model to cache
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_NAME')"
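# Optional sanity check (a sketch; pandas/pyarrow come in via the `datasets`
# install above): confirm the preprocessed parquet files exist and inspect
# their columns before submitting the job.
# python3 -c "import pandas as pd; df = pd.read_parquet('$HOME/data/gsm8k/train.parquet'); print(len(df), list(df.columns))"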
# Start Ray head node if not already running
if ! is_ray_alive "$MASTER_ADDR"; then
echo "Starting Ray head node..."
ray start --head --node-ip-address="$MASTER_ADDR" \
--port $HEAD_PORT --dashboard-port $DASH_PORT \
--dashboard-host=0.0.0.0 \
--dashboard-agent-listen-port=52366 \
--disable-usage-stats \
--num-gpus=$NUM_GPUS_PER_NODE
sleep 10
else
echo "Ray is already running at $MASTER_ADDR:$HEAD_PORT, reusing existing instance"
ray status --address="$MASTER_ADDR:$HEAD_PORT"
fi
ray status
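# Print the InfiniBand-to-netdev mapping (useful when debugging NCCL/Gloo interface selection)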
ibdev2netdev
# Submit the training job to Ray
export RAY_ADDRESS="http://localhost:$DASH_PORT"
echo "Submitting training job to Ray cluster..."
HYDRA_FULL_ERROR=1 ray job submit --address="$RAY_ADDRESS" --working-dir=. \
--runtime-env=runtime_env_custom.yaml \
-- python3 -m verl.trainer.main_ppo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=128 \
data.max_prompt_length=512 \
data.max_response_length=256 \
actor_rollout_ref.model.path=$MODEL_NAME \
actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=$CRITIC_LR \
critic.model.path=$MODEL_NAME \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.ppo_mini_batch_size=64 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.project_name=ppo_training \
trainer.experiment_name=qwen-2.5-0.5B \
trainer.val_before_train=False \
trainer.n_gpus_per_node=$NUM_GPUS_PER_NODE \
trainer.nnodes=$NUM_NODES \
trainer.default_local_dir=$HOME/checkpoints \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.total_epochs=$TOTAL_EPOCHS \
trainer.logger=['console'] \
trainer.resume_mode=auto 2>&1 | tee verl_training.log
# To enable W&B logging:
# 1. Set WANDB_API_KEY in envs or pass via --secret WANDB_API_KEY
# 2. Change trainer.logger to: trainer.logger=['console', 'wandb']
# 3. Add: trainer.wandb_project='verl-rlhf'
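# Example of the changed/added flags as they would appear in the command above (sketch only):
#   trainer.logger=['console','wandb']
#   trainer.wandb_project='verl-rlhf'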
exit 1 # a non-zero exit forces the worker nodes to shut down
else
# Worker nodes: connect to Ray head
echo "Setting up worker node..."
echo "Head IP: $MASTER_ADDR"
echo "HEAD_PORT: $HEAD_PORT"
echo "NUM_GPUS: $NUM_GPUS_PER_NODE"
# Get this worker's IP address
worker_ip=$(hostname -I | awk '{print $1}')
echo "Worker IP: $worker_ip"
echo "Checking if worker $worker_ip is already in Ray cluster at $MASTER_ADDR:$HEAD_PORT"
if ray list nodes --address=$MASTER_ADDR:$HEAD_PORT 2>/dev/null | grep -q "$worker_ip"; then
echo "Worker $worker_ip already connected to Ray cluster"
ray status --address=$MASTER_ADDR:$HEAD_PORT
else
echo "Worker not connected, waiting for head node to start"
sleep 20
echo "Starting Ray worker"
ray start --address $MASTER_ADDR:$HEAD_PORT --disable-usage-stats --num-gpus=$NUM_GPUS_PER_NODE
echo "Ray start exit code: $?"
# Verify connection after starting
sleep 5
ray status --address=$MASTER_ADDR:$HEAD_PORT
fi
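# Keep this worker alive until the head node stops responding (i.e., training has finished and the head exited)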
while ping -c 1 ${MASTER_ADDR}; do
sleep 5
done
fi
Launch the job:

konduktor launch verl.yaml

Sample training logs:
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) [prompt] system
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) user
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Let's think step by step and output the final answer after "####".
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) assistant
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513)
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) [response] First, we calculate the number of eggs Janet's ducks lay in a day. Since she lays 16 eggs per day, the number of eggs she eats in a day is 16 - 3 - 4 = 10 eggs. This means she sells 10 eggs daily at the farmers' market. Since each egg sells for $2, the daily earnings are 10 * 2 = $20. Therefore, the amount of money Janet makes every day at the farmers' market is $20.
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513)
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) #### 20
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) #### 20
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) [ground_truth] 18
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) [score] 0.0
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) len reward_extra_infos_dict['reward']: 1319
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) local_global_step_folder: /root/checkpoints/global_step_174
(job_name=verl-multinode-training-7550 worker_id=0) (WorkerDict pid=1498, ip=172.17.1.94) INFO:2025-08-25 23:17:09,357:[Rank 0] Saved model to /root/checkpoints/global_step_174/actor/model_world_size_16_rank_0.pt
(job_name=verl-multinode-training-7550 worker_id=0) (WorkerDict pid=1499, ip=172.17.1.94) INFO:2025-08-25 23:17:09,336:[Rank 1] Saved model to /root/checkpoints/global_step_174/actor/model_world_size_16_rank_1.pt
(job_name=verl-multinode-training-7550 worker_id=0) (WorkerDict pid=24109) INFO:2025-08-25 23:17:09,614:[Rank 10] Saved optim to /root/checkpoints/global_step_174/actor/optim_world_size_16_rank_10.pt
(job_name=verl-multinode-training-7550 worker_id=0) (WorkerDict pid=24109) INFO:2025-08-25 23:17:09,615:[Rank 10] Saved extra_state to /root/checkpoints/global_step_174/actor/extra_state_world_size_16_rank_10.pt
(job_name=verl-multinode-training-7550 worker_id=0) (WorkerDict pid=1498, ip=172.17.1.94) INFO:2025-08-25 23:17:10,020:[Rank 0] Saved model config and tokenizer class to /root/checkpoints/global_step_174/actor/huggingface
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) step:174 - global_seqlen/min:1756 - global_seqlen/max:2241 - global_seqlen/minmax_diff:485 - global_seqlen/balanced_min:1974 - global_seqlen/balanced_max:1977 - global_seqlen/mean:1976.0 - actor/entropy:0.10149244964122772 - critic/vf_loss:0.051519379019737244 - critic/vf_clipfrac:0.0 - critic/vpred_mean:0.8081025183200836 - critic/grad_norm:36.863529205322266 - perf/mfu/critic:0.01831457949805057 - critic/lr:1e-05 - actor/pg_loss:-0.5662342011928558 - actor/pg_clipfrac:0.0051369862630963326 - actor/ppo_kl:-0.0011725000804290175 - actor/pg_clipfrac_lower:0.0 - actor/grad_norm:4.237640619277954 - perf/mfu/actor:0.0185783582944669 - perf/max_memory_allocated_gb:56.59689235687256 - perf/max_memory_reserved_gb:59.7578125 - perf/cpu_memory_used_gb:58.61131286621094 - actor/lr:1e-06 - val-core/openai/gsm8k/reward/mean@1:0.4890068233510235 - training/global_step:174 - training/epoch:2 - critic/score/mean:0.609375 - critic/score/max:1.0 - critic/score/min:0.0 - critic/rewards/mean:0.609375 - critic/rewards/max:1.0 - critic/rewards/min:0.0 - critic/advantages/mean:1.1653965792390863e-08 - critic/advantages/max:2.7394912242889404 - critic/advantages/min:-2.5047049522399902 - critic/returns/mean:0.5375215411186218 - critic/returns/max:1.0 - critic/returns/min:0.0 - critic/values/mean:0.73046875 - critic/values/max:1.2421875 - critic/values/min:0.01556396484375 - critic/vf_explained_var:0.3024832606315613 - response_length/mean:145.125 - response_length/max:256.0 - response_length/min:75.0 - response_length/clip_ratio:0.0546875 - response_length_non_aborted/mean:145.125 - response_length_non_aborted/max:256.0 - response_length_non_aborted/min:75.0 - response_length_non_aborted/clip_ratio:0.0546875 - response/aborted_ratio:0.0 - prompt_length/mean:101.875 - prompt_length/max:169.0 - prompt_length/min:73.0 - prompt_length/clip_ratio:0.0 - timing_s/start_profile:5.1025766879320145e-05 - timing_s/generate_sequences:0.6771697402000427 - timing_s/reshard:0.5008471012115479 - timing_s/generation_timing/max:0.7759467363357544 - timing_s/generation_timing/min:0.49384644627571106 - timing_s/generation_timing/topk_ratio:0.125 - timing_s/gen:1.8636266626417637 - timing_s/reward:0.017636850010603666 - timing_s/old_log_prob:0.2443925621919334 - timing_s/values:0.14951528888195753 - timing_s/adv:0.01702857529744506 - timing_s/update_critic:0.44504289515316486 - timing_s/update_actor:0.43918753834441304 - timing_s/step:3.178868151269853 - timing_s/testing:3.8995088669471443 - timing_s/save_checkpoint:1.738096852786839 - timing_s/stop_profile:5.9116631746292114e-05 - timing_per_token_ms/update_actor:0.01389130624824181 - timing_per_token_ms/adv:0.0005386062530821439 - timing_per_token_ms/update_critic:0.014076508576453848 - timing_per_token_ms/gen:0.10032443274341966 - timing_per_token_ms/values:0.004729102001580134 - perf/total_num_tokens:31616 - perf/time_per_step:3.178868151269853 - perf/throughput:621.6048939339158
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) ("Final validation metrics: {'val-core/openai/gsm8k/reward/mean@1': "
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513) '0.4890068233510235}')
(job_name=verl-multinode-training-7550 worker_id=0) (TaskRunner pid=14513)
(job_name=verl-multinode-training-7550 worker_id=0) Training Progress: 100%|██████████| 174/174 [14:01<00:00, 5.33s/it]
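The checkpoint lines above show the actor being saved under /root/checkpoints/global_step_174/, with per-rank FSDP weight shards plus a plain Hugging Face config and tokenizer under actor/huggingface. Below is a minimal inspection sketch, assuming transformers is installed where you run it, that the path matches your run and step, and that the tokenizer files were saved alongside the config (as the log indicates); merging the sharded weights back into a single Hugging Face model is left to verl's own tooling.

from pathlib import Path

from transformers import AutoConfig, AutoTokenizer

# Checkpoint path taken from the log output above; adjust to your run/step.
ckpt_dir = Path("/root/checkpoints/global_step_174/actor")

# verl saves the model config and tokenizer in plain Hugging Face format.
config = AutoConfig.from_pretrained(ckpt_dir / "huggingface")
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir / "huggingface")
print(config.model_type, len(tokenizer))

# The actor weights are sharded per rank (world size 16 in this run) and need
# to be merged before they can be loaded with AutoModelForCausalLM.
for shard in sorted(ckpt_dir.glob("model_world_size_*_rank_*.pt")):
    print(shard.name, f"{shard.stat().st_size / 1e6:.1f} MB")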