When training Qwen3-VL-30B-A3B-Instruct across two machines (16 A100 GPUs in total) with pipeline parallelism (pp=1), the following error occurs.

Docker image: verlai/verl:vllm011.dev_qwenvl_cp
VERL commit: 0eb50ec4a33cda97e05ed8caab9c7f17a30c05a9

The error:
ray.exceptions.RayTaskError(DistBackendError): ray::WorkerDict.actor_rollout_init_model() (pid=83613, ip=10.178.141.141, actor_id=1b6ca82c8bf5c6fdf41fab710f000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7fa2ee0f69f0>)
File "/tmp/ray/session_2025-11-12_12-48-12_225739_33645/runtime_resources/working_dir_files/_ray_pkg_7cebd7d28058c485/verl/single_controller/ray/base.py", line 700, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-11-12_12-48-12_225739_33645/runtime_resources/working_dir_files/_ray_pkg_7cebd7d28058c485/verl/single_controller/base/decorator.py", line 442, in inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-11-12_12-48-12_225739_33645/runtime_resources/working_dir_files/_ray_pkg_7cebd7d28058c485/verl/utils/transferqueue_utils.py", line 199, in dummy_inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-11-12_12-48-12_225739_33645/runtime_resources/working_dir_files/_ray_pkg_7cebd7d28058c485/verl/workers/megatron_workers.py", line 477, in init_model
) = self._build_model_optimizer(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-11-12_12-48-12_225739_33645/runtime_resources/working_dir_files/_ray_pkg_7cebd7d28058c485/verl/workers/megatron_workers.py", line 318, in _build_model_optimizer
actor_module = make_megatron_module(
^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-11-12_12-48-12_225739_33645/runtime_resources/working_dir_files/_ray_pkg_7cebd7d28058c485/verl/utils/megatron_utils.py", line 192, in make_megatron_module
return bridge.get_model(
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/mbridge/core/bridge.py", line 120, in get_model
model = get_model(
^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/mbridge/core/util.py", line 243, in get_model
model_module.broadcast_params()
File "/usr/local/lib/python3.12/dist-packages/megatron/core/distributed/distributed_data_parallel.py", line 630, in broadcast_params
torch.distributed.broadcast(
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 2824, in broadcast
work = group.broadcast([tensor], opts)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3699, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.27.3
ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
Last error:

The test script:

set -xeuo pipefail
pwd=`pwd`
# verlai/verl:vllm011.dev_qwenvl_cp
cd /data/dubingnan/dbn-ceph/verl-251111/verl
project_name='DAPO'
exp_name='deeprl_qwen3vl_mg_dapo_train_ep8_tp2_pp1_cp1_vllm_tp8'
ENGINE=${1:-vllm}
# Paths
MODEL_PATH=/data/dubingnan/dbn-ceph/models/huggingface/Qwen/Qwen3-VL-30B-A3B-Instruct
CKPTS_DIR=${pwd}/ckpt5/${exp_name}
train_path=/data/dubingnan/dbn-ceph/datasets/data/geo3k/train.parquet
test_path=/data/dubingnan/dbn-ceph/datasets/data/geo3k/test.parquet
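# Sanity check of the parallel layout configured below (derived from the flags in this
# script, not from the training log):
#   world_size = trainer.nnodes * trainer.n_gpus_per_node = 2 * 8 = 16
#   actor DP   = 16 / (TP=2 * PP=1 * CP=1) = 8, with EP=8 inside the MoE layers
#   rollout    = vLLM TP=8 -> 16 / 8 = 2 inference engine replicas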
ray job submit \
--runtime-env=verl/trainer/runtime_env.yaml \
--no-wait \
-- \
python3 -m recipe.dapo.main_dapo --config-path=config \
--config-name='dapo_megatron_trainer.yaml'\
algorithm.adv_estimator=grpo \
data.train_files="$train_path" \
data.val_files="$test_path" \
data.train_batch_size=32 \
data.max_prompt_length=1024 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.filter_overlong_prompts_workers=128 \
algorithm.filter_groups.enable=True \
algorithm.filter_groups.max_num_gen_batches=20 \
algorithm.filter_groups.metric=seq_reward \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_fused_kernels=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
actor_rollout_ref.actor.megatron.expert_model_parallel_size=8 \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1 \
actor_rollout_ref.actor.megatron.context_parallel_size=1 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.01 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5120 \
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=20480 \
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
actor_rollout_ref.rollout.name=vllm \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.actor.megatron.use_mbridge=True \
actor_rollout_ref.actor.megatron.param_offload=True \
actor_rollout_ref.actor.megatron.optimizer_offload=True \
actor_rollout_ref.actor.megatron.grad_offload=True \
actor_rollout_ref.ref.megatron.param_offload=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
reward_model.reward_manager=dapo \
reward_model.overlong_buffer.enable=True \
reward_model.overlong_buffer.len=1024 \
reward_model.overlong_buffer.penalty_factor=1.0 \
reward_model.overlong_buffer.log=False \
+reward_model.reward_kwargs.max_resp_len=2048 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.val_before_train=False \
trainer.logger='["console","tensorboard"]' \
trainer.project_name=${project_name} \
trainer.experiment_name=${exp_name} \
trainer.default_local_dir="${CKPTS_DIR}" \
trainer.n_gpus_per_node=8 \
trainer.nnodes=2 \
trainer.save_freq=-1 \
trainer.test_freq=2 \
trainer.max_actor_ckpt_to_keep=1 \
trainer.total_epochs=15 \
2>&1 | tee ${pwd}/log/${exp_name}_$(date +'%Y%m%d_%H%M%S').log
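The traceback fails at the first cross-node broadcast during model init and suggests rerunning with NCCL_DEBUG=INFO. A minimal sketch for capturing that output is below, assuming the variables are exported before ray job submit; for Ray workers they typically also need to be listed under env_vars in verl/trainer/runtime_env.yaml, and the interface name is a placeholder, not taken from this setup.

# Debugging sketch (assumption): surface NCCL's own error detail for the failing broadcast.
export NCCL_DEBUG=INFO            # per-rank NCCL init/transport logging
export NCCL_DEBUG_SUBSYS=INIT,NET # focus on bootstrap and network setup
# ncclSystemError on the first cross-node collective is often a NIC-selection issue;
# pinning the interface is a common workaround (eth0 is a placeholder, check `ip addr`):
# export NCCL_SOCKET_IFNAME=eth0
# export GLOO_SOCKET_IFNAME=eth0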