Train with MosaicML Composer
name: composer-train
num_nodes: 4
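# 4 nodes x 8 H200 GPUs per node = 32 GPUs, one training process per GPU.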
resources:
  cpus: 124
  memory: 1400
  accelerators: H200:8
  image_id: nvcr.io/nvidia/pytorch:24.10-py3
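  # Scheduling labels: the Kueue queue to submit to, and a cap on the
  # job's run duration, in seconds.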
  labels:
    kueue.x-k8s.io/queue-name: user-queue
    maxRunDurationSeconds: "3200"
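
# The run script executes on every node: it clones LLM Foundry, converts a
# small C4 subset to the StreamingDataset format, then launches a short
# MPT-125M pretraining run across all nodes.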
run: |
  git clone https://github.com/mosaicml/llm-foundry.git
  # Install LLM Foundry so the `composer` launcher and the data prep
  # dependencies are available in the stock NGC PyTorch image.
  cd llm-foundry && pip install -e ".[gpu]"
  cd scripts
  # Convert the C4 dataset to StreamingDataset format
  python data_prep/convert_dataset_hf.py \
    --dataset allenai/c4 --data_subset en \
    --out_root my-copy-c4 --splits train_small val_small \
    --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
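  # Launch one training process per GPU on every node via the Composer
  # launcher. NUM_NODES, NUM_GPUS_PER_NODE, RANK, and MASTER_ADDR are
  # expected to be set in each node's environment by the scheduler.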
  composer -n ${NUM_GPUS_PER_NODE} \
    --world_size $(($NUM_NODES * $NUM_GPUS_PER_NODE)) \
    --node_rank $RANK \
    --master_addr $MASTER_ADDR \
    --master_port 8080 \
    train/train.py \
    train/yamls/pretrain/mpt-125m.yaml \
    variables.data_local=my-copy-c4 \
    train_loader.dataset.split=train_small \
    eval_loader.dataset.split=val_small \
    max_duration=10ba \
    eval_interval=0 \
    save_folder=mpt-125m
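
The trailing key=value arguments are dot-path overrides merged on top of
train/yamls/pretrain/mpt-125m.yaml: max_duration=10ba caps training at 10
batches and eval_interval=0 disables evaluation, making this a quick
end-to-end smoke test. Checkpoints are written to the save_folder, mpt-125m.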