Commit a9f13e5

[Kernel][FusedMoE] Add support for bias (#1167)

Signed-off-by: Kyuyeun Kim <[email protected]>
1 parent 691ce91 · commit a9f13e5

6 files changed: +504 -98 lines changed
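In the gated MoE layers these tests exercise, bias support means adding a per-expert bias after each of the two projections: b1, of shape (num_experts, 2, intermediate_size), after the fused gate/up matmul against w1, and b2, of shape (num_experts, hidden_size), after the down matmul against w2. Below is a minimal pure-JAX sketch of that computation using the shapes from gen_moe_inputs in the first diff; moe_with_bias_reference is our name, softmax-over-top-k routing is an assumption, and the kernel's actual routing and accumulation order may differ.

    import jax
    import jax.numpy as jnp

    def moe_with_bias_reference(a, w1, w2, b1, b2, gating_output, top_k):
        """Dense per-token reference for a biased, SiLU-gated MoE (sketch)."""
        weights, idx = jax.lax.top_k(gating_output.astype(jnp.float32), top_k)
        weights = jax.nn.softmax(weights, axis=-1)       # (T, top_k)
        out = jnp.zeros(a.shape, jnp.float32)
        for k in range(top_k):
            e = idx[:, k]                                # (T,) expert id per token
            gate = jnp.einsum('th,thi->ti', a, w1[e, 0])
            up = jnp.einsum('th,thi->ti', a, w1[e, 1])
            if b1 is not None:                           # bias on both w1 halves
                gate = gate + b1[e, 0]
                up = up + b1[e, 1]
            h = jax.nn.silu(gate) * up
            y = jnp.einsum('ti,tih->th', h, w2[e])
            if b2 is not None:                           # bias on down projection
                y = y + b2[e]
            out = out + weights[:, k, None] * y.astype(jnp.float32)
        return out.astype(a.dtype)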


tests/kernels/fused_moe_v1_test.py

Lines changed: 63 additions & 6 deletions
@@ -10,6 +10,15 @@
 jax.config.parse_flags_with_absl()
 
 
+def cdiv(a, b):
+    assert b != 0
+    return (a + b - 1) // b
+
+
+def align_to(x, a):
+    return cdiv(x, a) * a
+
+
 def gen_moe_inputs(
     dtype,
     top_k,
@@ -19,33 +28,49 @@ def gen_moe_inputs(
     num_tokens,
     *,
     seed=1234,
+    has_bias=False,
 ):
     key = jax.random.key(seed)
-    k0, k1, k2, k4, k5 = jax.random.split(key, 5)
+    k0, k1, k2, k3, k4, k5, k6 = jax.random.split(key, 7)
+
     a = jax.random.normal(k0, (num_tokens, hidden_size),
                           dtype=jnp.float32).astype(dtype) / 10
+
     w1 = (jax.random.normal(
         k1,
         (num_experts, 2, hidden_size, intermediate_size),
         dtype=jnp.float32,
     ) / 10).astype(dtype)
     w2 = (jax.random.normal(k2, (num_experts, intermediate_size, hidden_size),
                             dtype=jnp.float32) / 10).astype(dtype)
+
+    if has_bias:
+        b1 = (jax.random.normal(k3, (num_experts, 2, intermediate_size),
+                                dtype=jnp.float32) / 10).astype(dtype)
+        b2 = (jax.random.normal(k4, (num_experts, hidden_size),
+                                dtype=jnp.float32) / 10).astype(dtype)
+    else:
+        b1 = b2 = None
+
     gating_output = (
-        jax.random.normal(k4, (num_tokens, num_experts), dtype=jnp.float32) +
+        jax.random.normal(k5, (num_tokens, num_experts), dtype=jnp.float32) +
         jnp.arange(num_tokens * num_experts, dtype=jnp.float32).reshape(
             num_tokens, num_experts) / 100)
+
     # To generate unique top-k!
-    top_k_indices = jax.random.randint(k5, (num_tokens, top_k),
+    top_k_indices = jax.random.randint(k6, (num_tokens, top_k),
                                        minval=0,
                                        maxval=num_experts - 1,
                                        dtype=jnp.int32)
+
     one_hot = (jnp.sum(
         jax.nn.one_hot(top_k_indices, num_experts, dtype=jnp.float32),
         axis=1,
     ) * 30)
+
     gating_output = (gating_output + one_hot).astype(dtype)
-    return a, w1, w2, gating_output
+
+    return a, w1, w2, b1, b2, gating_output
 
 
 def sub_channel_quantize(x, quant_dtype, wsz=256):
@@ -104,18 +129,19 @@ def _test_moe(
     act_fn="silu",
     w_dtype=None,
     subc_quant_wsz=None,
-    use_benchmark_baseline=False,
+    has_bias=False,
     atol=2e-1,
     rtol=2e-1,
 ):
-    a, w1, w2, gating_output = gen_moe_inputs(
+    a, w1, w2, b1, b2, gating_output = gen_moe_inputs(
         dtype,
         top_k,
         num_experts,
         hidden_size,
         intermediate_size,
         num_tokens,
         seed=seed,
+        has_bias=has_bias,
     )
     w1_scale = None
     w2_scale = None
@@ -137,6 +163,8 @@
         subc_quant_wsz=subc_quant_wsz,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
+        b1=b1,
+        b2=b2,
         bt=bt,
         bf=bf,
         bd1=bd1,
@@ -152,6 +180,8 @@
         w2,
         gating_output,
         top_k,
+        b1=b1,
+        b2=b2,
         renormalize_topk_logits=renormalize_topk_logits,
         activation=act_fn,
         subc_quant_wsz=subc_quant_wsz,
@@ -312,6 +342,33 @@ def test_sub_channel_quantization(self, w_dtype):
             bd2c=256,
         )
 
+    def test_bias(self):
+        dtype = jnp.bfloat16
+        top_k = 8
+        num_experts = 128
+        hidden_size = 1024
+        intermediate_size = 1024
+        num_tokens = 8 * 32
+        self._test_moe(
+            dtype=dtype,
+            top_k=top_k,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            num_tokens=num_tokens,
+            seed=1234,
+            renormalize_topk_logits=False,
+            has_bias=True,
+            bt=32,
+            bf=512,
+            bd1=512,
+            bd2=512,
+            btc=32,
+            bfc=256,
+            bd1c=256,
+            bd2c=256,
+        )
+
 
 if __name__ == "__main__":
     absltest.main(testLoader=jtu.JaxTestLoader())
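The cdiv and align_to helpers added at the top of this test are the usual ceiling-division and round-up-to-multiple utilities, presumably for sizing padded buffers against the kernel's block shapes. For example:

    def cdiv(a, b):
        assert b != 0
        return (a + b - 1) // b

    def align_to(x, a):
        return cdiv(x, a) * a

    assert cdiv(1000, 512) == 2         # 1000 rows fill two 512-row blocks
    assert align_to(1000, 512) == 1024  # padded up to the next multiple of 512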

tests/layers/vllm/test_mxfp4.py

Lines changed: 88 additions & 3 deletions
@@ -10,7 +10,7 @@
 from jax.sharding import NamedSharding, PartitionSpec
 from torchax.interop import torch_view
 from torchax.ops.mappings import j2t, t2j
-from vllm.config import set_current_vllm_config
+from vllm.config import ParallelConfig, set_current_vllm_config
 from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
                                              init_distributed_environment)
 from vllm.engine.arg_utils import EngineArgs
@@ -114,8 +114,8 @@ def test_quant_override(model, mesh):
 @pytest.mark.parametrize("hidden_size", [128])
 @pytest.mark.parametrize("num_experts", [8])
 @pytest.mark.parametrize("topk", [2])
-def test_fused_moe_bias(mesh, num_tokens, intermediate_size, hidden_size,
-                        num_experts, topk):
+def test_mxfp4_fused_moe(mesh, num_tokens, intermediate_size, hidden_size,
+                         num_experts, topk):
     torch.manual_seed(42)
     dtype = torch.bfloat16
 
@@ -192,3 +192,88 @@ def test_fused_moe_bias(mesh, num_tokens, intermediate_size, hidden_size,
                                atol=0.1)
 
     vllm_fused_moe(jax_a, score)
+
+
+@pytest.mark.parametrize("mesh", [
+    test_utils.get_spmd_mesh(1),
+    test_utils.get_spmd_mesh(jax.local_device_count())
+])
+@pytest.mark.parametrize("num_tokens", [8])
+@pytest.mark.parametrize("intermediate_size", [1024])
+@pytest.mark.parametrize("hidden_size", [128])
+@pytest.mark.parametrize("num_experts", [8])
+@pytest.mark.parametrize("topk", [2])
+def test_mxfp4_fused_moe_use_kernel(mesh, num_tokens, intermediate_size,
+                                    hidden_size, num_experts, topk):
+    torch.manual_seed(42)
+    dtype = torch.bfloat16
+
+    a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
+    w1 = torch.randn(
+        (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
+    w2 = torch.randn(
+        (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
+    w1_weight, w1_weight_scale = quantize_to_mxfp4(w1)
+    w2_weight, w2_weight_scale = quantize_to_mxfp4(w2)
+
+    w1_bias = torch.randn(
+        (num_experts, 2 * intermediate_size), dtype=dtype) / 10
+    w2_bias = torch.randn((num_experts, hidden_size), dtype=dtype) / 10
+    score = torch.randn((num_tokens, num_experts), dtype=dtype)
+
+    engine_args = EngineArgs(
+        model=MODELS[0],
+        max_model_len=64,
+        max_num_batched_tokens=64,
+        max_num_seqs=4,
+        load_format='dummy',
+    )
+    vllm_config = engine_args.create_engine_config()
+    vllm_config.model_config.dtype = dtype
+    vllm_config.parallel_config = ParallelConfig(
+        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=True)
+
+    quant_config = get_tpu_quantization_config(vllm_config, mesh)
+    with set_current_vllm_config(vllm_config):
+        vllm_fused_moe = FusedMoE(
+            num_experts=num_experts,
+            top_k=topk,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=False,
+            renormalize=False,
+            tp_size=1,
+            dp_size=1,
+            quant_config=quant_config,
+            has_bias=True,
+        )
+    vllm_fused_moe.w13_weight.data = w1_weight
+    vllm_fused_moe.w2_weight.data = w2_weight
+    vllm_fused_moe.w13_weight_scale.data = w1_weight_scale
+    vllm_fused_moe.w2_weight_scale.data = w2_weight_scale
+    vllm_fused_moe.w13_bias.data = w1_bias
+    vllm_fused_moe.w2_bias.data = w2_bias
+
+    with torchax.default_env(), set_forward_context(None, vllm_config):
+        assert isinstance(vllm_fused_moe.quant_method, VllmMxfp4MoEMethod)
+
+        jax_a = a.to('jax')
+        jax_a.apply_jax_(jax.device_put, NamedSharding(mesh, P(None, None)))
+        score = torch_view(t2j(score))
+        score.apply_jax_(jax.device_put, NamedSharding(mesh, P(None, None)))
+
+        vllm_fused_moe.quant_method.use_kernel = True
+        vllm_fused_moe.quant_method.process_weights_after_loading(
+            vllm_fused_moe)
+        vllm_fused_moe.quant_method.block_size = {
+            "bt": 32,
+            "bf": 512,
+            "bd1": 512,
+            "bd2": 512,
+            "btc": 32,
+            "bfc": 256,
+            "bd1c": 256,
+            "bd2c": 256,
+        }
+
+        vllm_fused_moe(jax_a, score)
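For background on the quantized path: MXFP4 stores FP4 (e2m1) element values together with one shared power-of-two scale per block of 32 consecutive elements. A rough fake-quantization sketch of that scheme, illustrative only — mxfp4_quantize_block and FP4_VALUES are our names; the repo's quantize_to_mxfp4 returns packed weight and scale tensors and may round differently:

    import torch

    # The eight non-negative magnitudes representable in FP4 (e2m1).
    FP4_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

    def mxfp4_quantize_block(x: torch.Tensor):
        """Fake-quantize one block of 32 values; returns (dequantized, scale)."""
        amax = x.abs().max()
        if amax == 0:
            return torch.zeros_like(x), torch.tensor(1.0)
        # Power-of-two scale so the largest magnitude lands at or below 6.0.
        scale = torch.exp2(torch.ceil(torch.log2(amax / 6.0)))
        q = (x / scale).float()
        # Snap each magnitude to the nearest FP4 grid point, keeping the sign.
        idx = (q.abs().unsqueeze(-1) - FP4_VALUES).abs().argmin(dim=-1)
        return FP4_VALUES[idx] * torch.sign(q) * scale, scale

The hand-written block_size dict in the test (bt, bf, bd1, bd2 and their *c counterparts) presumably pins the kernel's tile sizes so the small test shapes map onto fixed blocks rather than tuned defaults.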

tests/layers/vllm/test_unquantized.py

Lines changed: 14 additions & 11 deletions
@@ -1,4 +1,3 @@
-import os
 import tempfile
 
 import jax
@@ -416,7 +415,6 @@ def test_merged_column_parallel_linear(model, bias, mesh, fuse_matmuls,
 @pytest.mark.parametrize("topk", [2])
 def test_fused_moe(use_ep, mesh, num_tokens, intermediate_size, hidden_size,
                    num_experts, topk):
-    os.environ['VLLM_DISABLE_SHARED_EXPERTS_STREAM'] = '1'
     torch.manual_seed(42)
     dtype = torch.bfloat16
 
@@ -496,7 +494,6 @@ def test_fused_moe(use_ep, mesh, num_tokens, intermediate_size, hidden_size,
 @pytest.mark.parametrize("topk", [2])
 def test_fused_moe_bias(mesh, num_tokens, intermediate_size, hidden_size,
                         num_experts, topk):
-    os.environ['VLLM_DISABLE_SHARED_EXPERTS_STREAM'] = '1'
     torch.manual_seed(42)
     dtype = torch.bfloat16
 
@@ -563,7 +560,6 @@ def test_fused_moe_bias(mesh, num_tokens, intermediate_size, hidden_size,
 @pytest.mark.parametrize("activation", ["silu", "swigluoai"])
 def test_fused_moe_activation(mesh, num_tokens, intermediate_size, hidden_size,
                               num_experts, topk, activation):
-    os.environ['VLLM_DISABLE_SHARED_EXPERTS_STREAM'] = '1'
     torch.manual_seed(42)
     dtype = torch.bfloat16
 
@@ -613,21 +609,20 @@ def test_fused_moe_activation(mesh, num_tokens, intermediate_size, hidden_size,
     vllm_fused_moe(jax_a, score)
 
 
-@pytest.mark.parametrize("use_ep", [True])
 @pytest.mark.parametrize("mesh",
                          [test_utils.get_spmd_mesh(jax.local_device_count())])
 @pytest.mark.parametrize("num_tokens", [128, 512])
 @pytest.mark.parametrize("intermediate_size", [256, 512])
 @pytest.mark.parametrize("hidden_size", [256])
 @pytest.mark.parametrize("num_experts", [32])
-@pytest.mark.parametrize("topk", [2])
-def test_fused_moe_use_kernel(use_ep, mesh, num_tokens, intermediate_size,
-                              hidden_size, num_experts, topk):
+@pytest.mark.parametrize("topk", [8])
+@pytest.mark.parametrize("has_bias", [False, True])
+def test_fused_moe_use_kernel(mesh, num_tokens, intermediate_size, hidden_size,
+                              num_experts, topk, has_bias):
 
     if jax.local_device_count() < 8:
         pytest.skip("Test requires at least 8 devices")
 
-    os.environ['VLLM_DISABLE_SHARED_EXPERTS_STREAM'] = '1'
     torch.manual_seed(42)
     dtype = torch.bfloat16
 
@@ -636,6 +631,10 @@ def test_fused_moe_use_kernel(use_ep, mesh, num_tokens, intermediate_size,
         (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
     w2 = torch.randn(
         (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
+    if has_bias:
+        b1 = torch.randn(
+            (num_experts, 2 * intermediate_size), dtype=dtype) / 10
+        b2 = torch.randn((num_experts, hidden_size), dtype=dtype) / 10
 
     # Use deterministic gating_output generation (same logic as fused_moe_v1_test.py)
     # Generate base gating scores with deterministic pattern
@@ -679,7 +678,7 @@ def test_fused_moe_use_kernel(use_ep, mesh, num_tokens, intermediate_size,
     vllm_config = engine_args.create_engine_config()
     vllm_config.model_config.dtype = dtype
     vllm_config.parallel_config = ParallelConfig(
-        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=use_ep)
+        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=True)
 
     quant_config = get_tpu_quantization_config(vllm_config, mesh)
     with set_current_vllm_config(vllm_config):
@@ -693,11 +692,15 @@ def test_fused_moe_use_kernel(use_ep, mesh, num_tokens, intermediate_size,
             tp_size=mesh.devices.size,
             dp_size=1,
             quant_config=quant_config,
+            has_bias=has_bias,
         )
-    vllm_fused_moe.moe_parallel_config.use_ep = use_ep
+    vllm_fused_moe.moe_parallel_config.use_ep = True
 
     vllm_fused_moe.w13_weight.data = w1
     vllm_fused_moe.w2_weight.data = w2
+    if has_bias:
+        vllm_fused_moe.w13_bias.data = b1
+        vllm_fused_moe.w2_bias.data = b2
 
     p_spec = P('model', )
     jax_a = torch_view(t2j(a, use_dlpack=False))
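With these changes, the new coverage can be run directly; typical invocations, assuming a TPU host with at least 8 local devices for the kernel variants:

    # Pallas kernel test (absltest-based):
    python tests/kernels/fused_moe_v1_test.py

    # vLLM layer tests:
    pytest tests/layers/vllm/test_unquantized.py -k test_fused_moe
    pytest tests/layers/vllm/test_mxfp4.py -k test_mxfp4_fused_moe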
