6 changes: 4 additions & 2 deletions megatron/core/transformer/transformer_config.py
@@ -692,8 +692,10 @@ class TransformerConfig(ModelParallelConfig):
By default, softmax is done after top-k."""

moe_router_topk_scaling_factor: Optional[float] = None
"""Scaling factor for routing score in top-k selection, only works when moe_router_pre_softmax
enabled. Defaults to None, which means no scaling."""
"""Scaling factor applied to the routing probabilities after the top-k selection.
The scaling is applied unconditionally whenever this value is not None, for every
`moe_router_score_function` (softmax/sigmoid/sqrtsoftplus) and regardless of whether
`moe_router_pre_softmax` is enabled. Defaults to None, which means no scaling."""

moe_router_score_function: Literal['softmax', 'sigmoid', 'sqrtsoftplus'] = "softmax"
"""Score function for MoE routing. Can be "softmax", "sigmoid" or "sqrtsoftplus"."""
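To make the documented behavior concrete, here is a minimal sketch of where the factor enters. It is illustrative only: the tensor names and the sigmoid scoring below are assumptions for the example, not the actual megatron-core implementation.

import torch

# Illustrative sketch, not the megatron-core implementation: the scaling factor
# multiplies the routing probabilities only after the top-k experts have been
# chosen, for any score function and independent of moe_router_pre_softmax.
logits = torch.randn(8, 4)                 # [num_tokens, num_experts]
scores = torch.sigmoid(logits)             # could also be softmax or sqrtsoftplus
top_probs, top_indices = torch.topk(scores, k=2, dim=-1)
scaling_factor = 2.5                       # plays the role of moe_router_topk_scaling_factor
scaled_probs = top_probs * scaling_factor  # skipped only when the factor is None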
47 changes: 46 additions & 1 deletion tests/unit_tests/transformer/moe/test_routers.py
@@ -8,7 +8,11 @@

from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_submodules
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.moe.moe_utils import get_updated_expert_bias, router_gating_linear
from megatron.core.transformer.moe.moe_utils import (
    get_updated_expert_bias,
    router_gating_linear,
    topk_routing_with_score_function,
)
from megatron.core.transformer.moe.router import Router
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.training.initialize import _set_random_seed
@@ -563,3 +567,44 @@ def test_router_gating_linear_bias(router_dtype):
    assert torch.allclose(inp.grad, ref_inp.grad, **tols)
    assert torch.allclose(weight.grad, ref_weight.grad, **tols)
    assert torch.allclose(bias.grad, ref_bias.grad, **tols)


@pytest.mark.internal
@pytest.mark.parametrize("use_pre_softmax", [True, False])
@pytest.mark.parametrize("score_function", ["softmax", "sigmoid"])
def test_topk_scaling_factor_applies_for_all_pre_softmax_settings(use_pre_softmax, score_function):
    """`moe_router_topk_scaling_factor` should multiply the post-top-k probabilities for
    every combination of `score_function` and `use_pre_softmax`.

    Pins the documented behavior of `moe_router_topk_scaling_factor` (issue #1875): the
    scaling is applied unconditionally inside `topk_routing_with_score_function`, not
    only when `moe_router_pre_softmax=True`.
    """
    torch.manual_seed(0)
    logits = torch.randn(8, 4)
    scaling = 2.5

    probs_unscaled, map_unscaled = topk_routing_with_score_function(
        logits,
        topk=2,
        use_pre_softmax=use_pre_softmax,
        scaling_factor=None,
        score_function=score_function,
    )
    probs_scaled, map_scaled = topk_routing_with_score_function(
        logits,
        topk=2,
        use_pre_softmax=use_pre_softmax,
        scaling_factor=scaling,
        score_function=score_function,
    )

    # The selected experts must be identical: the scaling factor is applied to the
    # probabilities after the top-k selection, so it changes their magnitude but not
    # which experts are chosen.
    assert torch.equal(map_scaled, map_unscaled)
    selected = map_unscaled
    assert selected.any(), "Sanity: at least one expert should be selected"

    torch.testing.assert_close(
        probs_scaled[selected], probs_unscaled[selected] * scaling, rtol=1e-6, atol=1e-6
    )
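For context, this is roughly how the behavior pinned by the test surfaces through the config. The field values below are illustrative and not taken from this PR, and the exact set of required constructor arguments may vary between megatron-core versions.

from megatron.core.transformer.transformer_config import TransformerConfig

# Illustrative values only. With the documented behavior, the 2.5x factor is
# applied to the post-top-k routing probabilities regardless of the score
# function and regardless of moe_router_pre_softmax.
example_config = TransformerConfig(
    num_layers=2,
    hidden_size=12,
    num_attention_heads=4,
    num_moe_experts=4,
    moe_router_topk=2,
    moe_router_score_function="sigmoid",
    moe_router_pre_softmax=False,
    moe_router_topk_scaling_factor=2.5,
    use_cpu_initialization=True,
)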