6 changes: 4 additions & 2 deletions megatron/core/transformer/transformer_config.py
@@ -692,8 +692,10 @@ class TransformerConfig(ModelParallelConfig):
By default, softmax is done after top-k."""

moe_router_topk_scaling_factor: Optional[float] = None
"""Scaling factor for routing score in top-k selection, only works when moe_router_pre_softmax
enabled. Defaults to None, which means no scaling."""
"""Scaling factor applied to the routing probabilities after the top-k selection.
The scaling is applied unconditionally whenever this value is not None, for every
`moe_router_score_function` (softmax/sigmoid/sqrtsoftplus) and regardless of whether
`moe_router_pre_softmax` is enabled. Defaults to None, which means no scaling."""

moe_router_score_function: Literal['softmax', 'sigmoid', 'sqrtsoftplus'] = "softmax"
"""Score function for MoE routing. Can be "softmax", "sigmoid" or "sqrtsoftplus"."""
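To make the documented behavior concrete, here is a minimal sketch of where the factor enters. It is illustrative only: the tensor names and the sigmoid scoring below are assumptions for the example, not the actual megatron-core implementation.

import torch

# Illustrative sketch, not the megatron-core implementation: the scaling factor
# multiplies the routing probabilities only after the top-k experts have been
# chosen, for any score function and independent of moe_router_pre_softmax.
logits = torch.randn(8, 4)                 # [num_tokens, num_experts]
scores = torch.sigmoid(logits)             # could also be softmax or sqrtsoftplus
top_probs, top_indices = torch.topk(scores, k=2, dim=-1)
scaling_factor = 2.5                       # plays the role of moe_router_topk_scaling_factor
scaled_probs = top_probs * scaling_factor  # skipped only when the factor is None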
47 changes: 46 additions & 1 deletion tests/unit_tests/transformer/moe/test_routers.py
@@ -8,7 +8,11 @@

from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_submodules
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.moe.moe_utils import get_updated_expert_bias, router_gating_linear
from megatron.core.transformer.moe.moe_utils import (
    get_updated_expert_bias,
    router_gating_linear,
    topk_routing_with_score_function,
)
from megatron.core.transformer.moe.router import Router
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.training.initialize import _set_random_seed
@@ -563,3 +567,44 @@ def test_router_gating_linear_bias(router_dtype):
    assert torch.allclose(inp.grad, ref_inp.grad, **tols)
    assert torch.allclose(weight.grad, ref_weight.grad, **tols)
    assert torch.allclose(bias.grad, ref_bias.grad, **tols)


@pytest.mark.internal
@pytest.mark.parametrize("use_pre_softmax", [True, False])
@pytest.mark.parametrize("score_function", ["softmax", "sigmoid"])
def test_topk_scaling_factor_applies_for_all_pre_softmax_settings(use_pre_softmax, score_function):
    """`moe_router_topk_scaling_factor` should multiply the post-top-k probabilities for
    every combination of `score_function` and `use_pre_softmax`.

    Pins the documented behavior of `moe_router_topk_scaling_factor` (issue #1875): the
    scaling is applied unconditionally inside `topk_routing_with_score_function`, not
    only when `moe_router_pre_softmax=True`.
    """
    torch.manual_seed(0)
    logits = torch.randn(8, 4)
    scaling = 2.5

    probs_unscaled, map_unscaled = topk_routing_with_score_function(
        logits,
        topk=2,
        use_pre_softmax=use_pre_softmax,
        scaling_factor=None,
        score_function=score_function,
    )
    probs_scaled, map_scaled = topk_routing_with_score_function(
        logits,
        topk=2,
        use_pre_softmax=use_pre_softmax,
        scaling_factor=scaling,
        score_function=score_function,
    )

    # The selected experts must be identical: the scaling factor is applied to the
    # probabilities after the top-k selection, so it changes their magnitude but not
    # which experts are chosen.
    assert torch.equal(map_scaled, map_unscaled)
    selected = map_unscaled
    assert selected.any(), "Sanity: at least one expert should be selected"

    torch.testing.assert_close(
        probs_scaled[selected], probs_unscaled[selected] * scaling, rtol=1e-6, atol=1e-6
    )
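For context, this is roughly how the behavior pinned by the test surfaces through the config. The field values below are illustrative and not taken from this PR, and the exact set of required constructor arguments may vary between megatron-core versions.

from megatron.core.transformer.transformer_config import TransformerConfig

# Illustrative values only. With the documented behavior, the 2.5x factor is
# applied to the post-top-k routing probabilities regardless of the score
# function and regardless of moe_router_pre_softmax.
example_config = TransformerConfig(
    num_layers=2,
    hidden_size=12,
    num_attention_heads=4,
    num_moe_experts=4,
    moe_router_topk=2,
    moe_router_score_function="sigmoid",
    moe_router_pre_softmax=False,
    moe_router_topk_scaling_factor=2.5,
    use_cpu_initialization=True,
)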