Skip to content

[Bug]: Qwen3-Coder failing on latest main #1280

@jrplatin

Description

@jrplatin

I am running the following command:

python examples/offline_inference.py --model BCCard/Qwen3-Coder-480B-A35B-Instruct-FP8-Dynamic --tensor-parallel-size 8 --kv-cache-dtype=fp8 

on latest vLLM (c756fb678184b867ed94e5613a529198f1aee423) and TPU Inference (f72797fb2e112291a229468501033fa1381fc79b)

But I am seeing the following error:

(EngineCore_DP0 pid=3182777) During handling of the above exception, another exception occurred:
(EngineCore_DP0 pid=3182777) 
(EngineCore_DP0 pid=3182777) Traceback (most recent call last):
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/anaconda3/envs/ullm/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=3182777)     self.run()
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/anaconda3/envs/ullm/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=3182777)     self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 871, in run_engine_core
(EngineCore_DP0 pid=3182777)     raise e
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 858, in run_engine_core
(EngineCore_DP0 pid=3182777)     engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=3182777)                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 634, in __init__
(EngineCore_DP0 pid=3182777)     super().__init__(
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 102, in __init__
(EngineCore_DP0 pid=3182777)     self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=3182777)                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/v1/executor/abstract.py", line 101, in __init__
(EngineCore_DP0 pid=3182777)     self._init_executor()
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/v1/executor/uniproc_executor.py", line 48, in _init_executor
(EngineCore_DP0 pid=3182777)     self.driver_worker.load_model()
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/worker/tpu_worker.py", line 377, in load_model
(EngineCore_DP0 pid=3182777)     self.model_runner.load_model()
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/runner/tpu_runner.py", line 487, in load_model
(EngineCore_DP0 pid=3182777)     self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, multimodal_fns, self.state, self.lora_manager, self.model = get_model(
(EngineCore_DP0 pid=3182777)                                                                                                                                       ^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/models/common/model_loader.py", line 358, in get_model
(EngineCore_DP0 pid=3182777)     return get_vllm_model(vllm_config, rng, mesh)
(EngineCore_DP0 pid=3182777)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/models/common/model_loader.py", line 327, in get_vllm_model
(EngineCore_DP0 pid=3182777)     params, lora_manager = model.load_weights()
(EngineCore_DP0 pid=3182777)                            ^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/models/vllm/vllm_model_wrapper.py", line 132, in load_weights
(EngineCore_DP0 pid=3182777)     vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
(EngineCore_DP0 pid=3182777)                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/model_loader/__init__.py", line 132, in get_model
(EngineCore_DP0 pid=3182777)     return loader.load_model(vllm_config=vllm_config, model_config=model_config)
(EngineCore_DP0 pid=3182777)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/model_loader/base_loader.py", line 49, in load_model
(EngineCore_DP0 pid=3182777)     model = initialize_model(
(EngineCore_DP0 pid=3182777)             ^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/model_loader/utils.py", line 48, in initialize_model
(EngineCore_DP0 pid=3182777)     return model_class(vllm_config=vllm_config, prefix=prefix)
(EngineCore_DP0 pid=3182777)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 659, in __init__
(EngineCore_DP0 pid=3182777)     self.model = Qwen3MoeModel(
(EngineCore_DP0 pid=3182777)                  ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/compilation/decorators.py", line 291, in __init__
(EngineCore_DP0 pid=3182777)     old_init(self, **kwargs)
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 413, in __init__
(EngineCore_DP0 pid=3182777)     self.start_layer, self.end_layer, self.layers = make_layers(
(EngineCore_DP0 pid=3182777)                                                     ^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/utils.py", line 606, in make_layers
(EngineCore_DP0 pid=3182777)     maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(EngineCore_DP0 pid=3182777)                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 415, in <lambda>
(EngineCore_DP0 pid=3182777)     lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix),
(EngineCore_DP0 pid=3182777)                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 353, in __init__
(EngineCore_DP0 pid=3182777)     self.mlp = Qwen3MoeSparseMoeBlock(
(EngineCore_DP0 pid=3182777)                ^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 163, in __init__
(EngineCore_DP0 pid=3182777)     self.experts = FusedMoE(
(EngineCore_DP0 pid=3182777)                    ^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 594, in __init__
(EngineCore_DP0 pid=3182777)     self.quant_method: FusedMoEMethodBase = _get_quant_method()
(EngineCore_DP0 pid=3182777)                                             ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 586, in _get_quant_method
(EngineCore_DP0 pid=3182777)     quant_method = self.quant_config.get_quant_method(self, prefix)
(EngineCore_DP0 pid=3182777)                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py", line 116, in get_quant_method
(EngineCore_DP0 pid=3182777)     return VllmCompressedTensorsW8A8Fp8MoEMethod(
(EngineCore_DP0 pid=3182777)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777)   File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 32, in __init__
(EngineCore_DP0 pid=3182777)     super().__init__(quant_config, moe)
(EngineCore_DP0 pid=3182777) TypeError: CompressedTensorsW8A8Fp8MoEMethod.__init__() missing 1 required positional argument: 'moe'

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions