feat: support isolated MTP auxiliary loss

Victarry · liuzhenhai93 · Victarry · commit 9f51e9344c43 · 2026-06-01T14:19:33.000+08:00
Co-authored-by: liuzhenhai93 <liuzhenhai93@outlook.com>
diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from contextlib import nullcontext
 from typing import Optional
@@ -325,6 +325,7 @@ def __init__(
         runtime_gather_output: Optional[bool] = None,
         loss_mask: Optional[Tensor] = None,
         padding_mask=None,
+        return_logits: bool = False,
     ):
         """Initialize the schedule plan of all Transformer layers' sub-modules.
 
@@ -342,6 +343,8 @@ def __init__(
             extra_block_kwargs: Additional keyword arguments for blocks.
             runtime_gather_output: Whether to gather output at runtime.
             loss_mask (torch.Tensor): Used to mask out some portions of the loss
+            return_logits (bool): Return logits instead of main LM loss when labels
+                are provided. MTP auxiliary loss still consumes labels.
 
         Returns:
             The model chunk schedule plan.
@@ -365,6 +368,7 @@ def __init__(
         self._model_chunk_state.loss_mask = loss_mask
         self._model_chunk_state.packed_seq_params = packed_seq_params
         self._model_chunk_state.padding_mask = padding_mask
+        self._model_chunk_state.return_logits = return_logits
         self._model_chunk_state.extra_block_kwargs = extra_block_kwargs
         self._model_chunk_state.runtime_gather_output = runtime_gather_output
         self._model_chunk_state.model = model
diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import weakref
 from contextlib import nullcontext
@@ -228,6 +228,7 @@ def forward_impl(self, hidden_states):
             sequence_len_offset=self.chunk_state.sequence_len_offset,
             runtime_gather_output=self.chunk_state.runtime_gather_output,
             extra_block_kwargs=self.chunk_state.extra_block_kwargs,
+            return_logits=self.chunk_state.return_logits,
         )
 
         # For now, 1f1b only supports fp16 module
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from collections import OrderedDict
 from typing import Dict, Literal, Optional
@@ -503,6 +503,7 @@ def forward(
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
         padding_mask: Optional[Tensor] = None,
+        return_logits: bool = False,
     ) -> Tensor:
         """Forward function of the GPT Model This function passes the input tensors
         through the embedding layer, and then the decoder and finally into the post
@@ -516,6 +517,9 @@ def forward(
             padding_mask (Tensor, optional): Padding mask for MoE routing.
                 Shape [bsz, seq_length]. True = padding (exclude), False = valid (include).
                 Only used for MoE layers to exclude padding tokens from routing computations.
+            return_logits (bool): If True, return logits even when `labels` are provided.
+                This lets online RL pass sampled labels for MTP auxiliary loss while
+                computing the main RL loss externally from logits.
         """
         if self.config.fine_grained_activation_offloading:
             self.preprocess_for_fine_grained_offloading()
@@ -591,6 +595,7 @@ def forward(
             extra_block_kwargs=extra_block_kwargs,
             inference_context=inference_context,
             mhc_multistream=mhc_multistream,
+            return_logits=return_logits,
         )
 
     def _postprocess(
@@ -613,6 +618,7 @@ def _postprocess(
         extra_block_kwargs=None,
         inference_context=None,
         mhc_multistream=None,
+        return_logits=False,
     ):
         """Postprocesses decoder hidden states to generate logits or compute loss.
 
@@ -699,7 +705,8 @@ def _postprocess(
                 reshaped = hidden_states.squeeze(1).unsqueeze(0)
                 hidden_states = inference_context.last_token_logits(reshaped).unsqueeze(1)
 
-        if has_config_logger_enabled(self.config) or labels is None:
+        should_return_logits = return_logits or labels is None
+        if has_config_logger_enabled(self.config) or should_return_logits:
             logits, _ = self.output_layer(
                 hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
             )
@@ -730,7 +737,9 @@ def _postprocess(
             )
             log_config_to_disk(self.config, payload, prefix='input_and_logits')
 
-        if labels is None:
+        if should_return_logits:
+            # `return_logits` only changes the main LM output contract. MTP auxiliary
+            # loss above still consumes `labels`/`loss_mask` when they are provided.
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
@@ -763,6 +772,7 @@ def build_schedule_plan(
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
         padding_mask: Optional[Tensor] = None,
+        return_logits: bool = False,
     ):
         """Builds a computation schedule plan for the model.
 
@@ -789,6 +799,8 @@ def build_schedule_plan(
                 Parameters for inference. Defaults to None.
             loss_mask (Optional[Tensor], optional): Loss mask. Defaults to None.
             padding_mask (Optional[Tensor], optional): Padding mask. Defaults to None.
+            return_logits (bool, optional): Return logits instead of main LM loss when labels
+                are provided. MTP auxiliary loss still uses labels. Defaults to False.
 
         Returns:
             TransformerModelChunkSchedulePlan: The model chunk schedule plan.
@@ -813,6 +825,7 @@ def build_schedule_plan(
             runtime_gather_output,
             loss_mask,
             padding_mask,
+            return_logits,
         )
 
     def sharded_state_dict(
diff --git a/megatron/core/models/hybrid/hybrid_model.py b/megatron/core/models/hybrid/hybrid_model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import logging
 from typing import Literal, Optional
@@ -403,12 +403,15 @@ def forward(
         loss_mask: Optional[Tensor] = None,
         packed_seq_params: Optional[PackedSeqParams] = None,
         padding_mask: Optional[Tensor] = None,
+        return_logits: bool = False,
     ) -> Tensor:
         """Forward function of the Hybrid model. This function passes the input tensors
         through the embedding layer, and then the decoder and finally into the post
         processing layer (optional).
 
         It either returns the Loss values if labels are given or the final hidden units
+        unless `return_logits` is True. In that case, labels still drive MTP auxiliary
+        loss, while the main LM head returns logits for an external loss.
         """
         # If decoder_input is provided (not None), then input_ids and position_ids are ignored.
         # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input.
@@ -572,7 +575,9 @@ def forward(
             )
             self.output_layer.sequence_parallel = True
 
-        if labels is None:
+        if return_logits or labels is None:
+            # `return_logits` only controls the main LM output. Labels, when present,
+            # have already been consumed by MTP auxiliary loss above.
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 from __future__ import annotations
 
 import warnings
@@ -640,7 +640,7 @@ def set_loss_scale(scale: torch.Tensor):
 
 def process_mtp_loss(
     hidden_states: Tensor,
-    labels: Tensor,
+    labels: Optional[Tensor],
     loss_mask: Optional[Tensor],
     output_layer: Callable,
     output_weight: Optional[Tensor],
@@ -685,6 +685,23 @@ def process_mtp_loss(
     if loss_mask is None:
         loss_mask = torch.ones_like(mtp_labels)
 
+    output_weight_for_mtp = output_weight
+    output_layer_for_mtp = output_layer
+    if config.mtp_isolated_loss:
+        if output_weight_for_mtp is not None:
+            output_weight_for_mtp = output_weight_for_mtp.detach()
+        if isinstance(output_layer, torch.nn.Module):
+            output_layer_params = {
+                name: param.detach() for name, param in output_layer.named_parameters()
+            }
+            output_layer_buffers = dict(output_layer.named_buffers())
+            output_layer_state = {**output_layer_params, **output_layer_buffers}
+
+            def output_layer_for_mtp(input_: Tensor, **kwargs):
+                return torch.func.functional_call(
+                    output_layer, output_layer_state, args=(input_,), kwargs=kwargs
+                )
+
     # Store the original number of tokens before rolling for proper normalization
     # when calculate_per_token_loss is enabled. This ensures MTP gradients are
     # correctly scaled relative to the main loss gradients in finalize_model_grads.
@@ -701,17 +718,17 @@ def process_mtp_loss(
             loss_mask, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params
         )
         if fuse_linear_cross_entropy:
-            mtp_loss = output_layer(
+            mtp_loss = output_layer_for_mtp(
                 hidden_states_list[mtp_layer_number + 1],
-                weight=output_weight,
+                weight=output_weight_for_mtp,
                 runtime_gather_output=runtime_gather_output,
                 output_cross_entropy_loss=True,
                 labels=mtp_labels,
             )
         else:
-            mtp_logits, _ = output_layer(
+            mtp_logits, _ = output_layer_for_mtp(
                 hidden_states_list[mtp_layer_number + 1],
-                weight=output_weight,
+                weight=output_weight_for_mtp,
                 runtime_gather_output=runtime_gather_output,
             )
             if scale_logits_fn is not None:
@@ -991,6 +1008,8 @@ def _get_embeddings(
         )
         # embedding
         decoder_input = embedding(input_ids=input_ids, position_ids=position_ids)
+        if self.config.mtp_isolated_loss:
+            decoder_input = decoder_input.detach()
 
         hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
 
@@ -1724,6 +1743,11 @@ def forward(
             hidden_states = mhc_chunks[offset]
         else:
             hidden_states = hidden_states_list[offset]
+        if self.config.mtp_isolated_loss:
+            hidden_states = hidden_states.detach().requires_grad_(True)
+            hidden_states = make_viewless_tensor(
+                inp=hidden_states, requires_grad=True, keep_graph=False
+            )
         for iteration in range(self.config.mtp_num_layers):
             layer_idx = 0 if self.mtp_use_repeated_layer else iteration
             (hidden_states, input_ids, position_ids) = self.layers[layer_idx](
diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
@@ -81,6 +81,10 @@ class TransformerConfig(ModelParallelConfig):
     which serves as an additional training objective.
     """
 
+    mtp_isolated_loss: bool = False
+    """If True, MTP loss only updates MTP module parameters. The MTP loss graph is
+    detached from the main decoder, shared embeddings, and output layer weights."""
+
     mtp_use_repeated_layer: bool = False
     """Use a single MTP layer repeatedly instead of multiple separate layers."""
 
diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py