-
Notifications
You must be signed in to change notification settings - Fork 59
Closed
Labels
bug: Something isn't working
Description
I am running the following command:
python examples/offline_inference.py --model BCCard/Qwen3-Coder-480B-A35B-Instruct-FP8-Dynamic --tensor-parallel-size 8 --kv-cache-dtype=fp8
on the latest vLLM (c756fb678184b867ed94e5613a529198f1aee423) and TPU Inference (f72797fb2e112291a229468501033fa1381fc79b),
but I am seeing the following error:
(EngineCore_DP0 pid=3182777) During handling of the above exception, another exception occurred:
(EngineCore_DP0 pid=3182777)
(EngineCore_DP0 pid=3182777) Traceback (most recent call last):
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/anaconda3/envs/ullm/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=3182777) self.run()
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/anaconda3/envs/ullm/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=3182777) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 871, in run_engine_core
(EngineCore_DP0 pid=3182777) raise e
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 858, in run_engine_core
(EngineCore_DP0 pid=3182777) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 634, in __init__
(EngineCore_DP0 pid=3182777) super().__init__(
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/v1/engine/core.py", line 102, in __init__
(EngineCore_DP0 pid=3182777) self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/v1/executor/abstract.py", line 101, in __init__
(EngineCore_DP0 pid=3182777) self._init_executor()
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/v1/executor/uniproc_executor.py", line 48, in _init_executor
(EngineCore_DP0 pid=3182777) self.driver_worker.load_model()
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/worker/tpu_worker.py", line 377, in load_model
(EngineCore_DP0 pid=3182777) self.model_runner.load_model()
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/runner/tpu_runner.py", line 487, in load_model
(EngineCore_DP0 pid=3182777) self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, multimodal_fns, self.state, self.lora_manager, self.model = get_model(
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/models/common/model_loader.py", line 358, in get_model
(EngineCore_DP0 pid=3182777) return get_vllm_model(vllm_config, rng, mesh)
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/models/common/model_loader.py", line 327, in get_vllm_model
(EngineCore_DP0 pid=3182777) params, lora_manager = model.load_weights()
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/models/vllm/vllm_model_wrapper.py", line 132, in load_weights
(EngineCore_DP0 pid=3182777) vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/model_loader/__init__.py", line 132, in get_model
(EngineCore_DP0 pid=3182777) return loader.load_model(vllm_config=vllm_config, model_config=model_config)
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/model_loader/base_loader.py", line 49, in load_model
(EngineCore_DP0 pid=3182777) model = initialize_model(
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/model_loader/utils.py", line 48, in initialize_model
(EngineCore_DP0 pid=3182777) return model_class(vllm_config=vllm_config, prefix=prefix)
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 659, in __init__
(EngineCore_DP0 pid=3182777) self.model = Qwen3MoeModel(
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/compilation/decorators.py", line 291, in __init__
(EngineCore_DP0 pid=3182777) old_init(self, **kwargs)
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 413, in __init__
(EngineCore_DP0 pid=3182777) self.start_layer, self.end_layer, self.layers = make_layers(
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/utils.py", line 606, in make_layers
(EngineCore_DP0 pid=3182777) maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 415, in <lambda>
(EngineCore_DP0 pid=3182777) lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix),
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 353, in __init__
(EngineCore_DP0 pid=3182777) self.mlp = Qwen3MoeSparseMoeBlock(
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/models/qwen3_moe.py", line 163, in __init__
(EngineCore_DP0 pid=3182777) self.experts = FusedMoE(
(EngineCore_DP0 pid=3182777) ^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 594, in __init__
(EngineCore_DP0 pid=3182777) self.quant_method: FusedMoEMethodBase = _get_quant_method()
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 586, in _get_quant_method
(EngineCore_DP0 pid=3182777) quant_method = self.quant_config.get_quant_method(self, prefix)
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py", line 116, in get_quant_method
(EngineCore_DP0 pid=3182777) return VllmCompressedTensorsW8A8Fp8MoEMethod(
(EngineCore_DP0 pid=3182777) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3182777) File "/mnt/disks/jacobplatin/tpu-inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 32, in __init__
(EngineCore_DP0 pid=3182777) super().__init__(quant_config, moe)
(EngineCore_DP0 pid=3182777) TypeError: CompressedTensorsW8A8Fp8MoEMethod.__init__() missing 1 required positional argument: 'moe'
Metadata
Metadata
Assignees
Labels
bug: Something isn't working