From d6d11572ba77c44d69b7093bb613f02f524ea4d5 Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Tue, 26 May 2026 21:12:00 -0700
Subject: [PATCH 1/9] feat(engine): delegate megatron live weight sync to
 bridge.export_hf_weights

Add use_bridge_for_update_weights flag that routes the live weight
update path through megatron-bridge.export_hf_weights instead of the
hand-rolled convert_to_hf registry. Required for new model families
(e.g. Qwen3.5) that don't have a registry entry.

The bridge handles TP/EP/PP gather and HF layout transformation
internally; AReaL keeps the bucketed broadcast loop unchanged. FP8
and LoRA paths fall back to the registry automatically.

Also fix a latent device-context bug in _load_model_from_hf:
megatron-bridge builds shard-index tensors via torch.arange() under
the caller's `with self.device:` context, putting them on CUDA while
HF weights are loaded on CPU. The resulting indexing error trips
ChunkedMapping for any model with GDN/Mamba conv1d weights (e.g.
Qwen3.5). Force CPU as the factory-op default just around the
bridge.load_hf_weights call.

Key changes:
- New MegatronEngineConfig.use_bridge_for_update_weights flag
- Refactor _update_weights_from_distributed into dispatch +
  _update_weights_via_registry helper
- New _update_weights_via_bridge streams from bridge.export_hf_weights
  and reuses the bucket broadcast loop
- Wrap bridge.load_hf_weights in `with torch.device("cpu"):` to
  prevent CUDA index / CPU tensor mismatch in ChunkedMapping
---
 areal/api/cli_args.py           | 10 +++++
 areal/engine/megatron_engine.py | 74 +++++++++++++++++++++++++++++++--
 docs/en/cli_reference.md        |  1 +
 docs/zh/cli_reference.md        |  1 +
 4 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/areal/api/cli_args.py b/areal/api/cli_args.py
index 7c5344572..5757cbfb3 100644
--- a/areal/api/cli_args.py
+++ b/areal/api/cli_args.py
@@ -940,6 +940,16 @@ class MegatronEngineConfig:
         },
     )
 
+    use_bridge_for_update_weights: bool = field(
+        default=False,
+        metadata={
+            "help": "When True and bridge_type='megatron-bridge', delegate live "
+            "weight sync to bridge.export_hf_weights instead of the hand-rolled "
+            "convert_to_hf registry. Required for models without a registry entry "
+            "(e.g. Qwen3.5). FP8 paths fall back to the registry automatically.",
+        },
+    )
+
 
 class SchedulingStrategyType(str, Enum):
     separation = "separation"
diff --git a/areal/engine/megatron_engine.py b/areal/engine/megatron_engine.py
index f972fd083..3dfd4ffcf 100644
--- a/areal/engine/megatron_engine.py
+++ b/areal/engine/megatron_engine.py
@@ -1753,6 +1753,35 @@ def _update_weights_from_distributed(self, meta: WeightUpdateMeta) -> None:
 
         dist.barrier(group=self.cpu_group)
 
+        # Bridge delegation: when bridge_type=megatron-bridge and the user opts in,
+        # stream HF tensors directly from bridge.export_hf_weights. Falls back to
+        # the hand-rolled registry path for FP8 (quant_mapping in megatron-bridge
+        # is amax-style, not TE blockwise) and for LoRA (separate adapter export
+        # path not yet wired here).
+        use_bridge = (
+            self.bridge_cls == "megatron-bridge"
+            and self.mcore_config.use_bridge_for_update_weights
+            and not self.quantization_config
+            and not self.config.use_lora
+        )
+        if use_bridge:
+            self._update_weights_via_bridge(meta)
+        else:
+            self._update_weights_via_registry(meta)
+
+        if dist.get_rank() == 0:
+            self.rollout_engine.continue_generation()
+
+        current_platform.synchronize()
+        dist.barrier(group=self.cpu_group)
+
+    def _update_weights_via_registry(self, meta: WeightUpdateMeta) -> None:
+        """Hand-rolled conversion path via convert_to_hf registry.
+
+        Used for FP8, LoRA, and models with a converter entry. Iterates this PP
+        rank's local params, TP-gathers per param, converts to HF layout, and
+        bucket-broadcasts to the rollout engine.
+        """
         num_moe_experts = self.tf_config.num_moe_experts
         weight_chunked_mem_size = meta.weight_chunked_mem_mb * 1024 * 1024
 
@@ -1807,10 +1836,37 @@ def _update_weights_from_distributed(self, meta: WeightUpdateMeta) -> None:
 
         dist.barrier(group=self.cpu_group)
 
-        if dist.get_rank() == 0:
-            self.rollout_engine.continue_generation()
+    def _update_weights_via_bridge(self, meta: WeightUpdateMeta) -> None:
+        """Delegate live weight sync to megatron-bridge.export_hf_weights.
+
+        Streams (hf_name, hf_tensor) directly from the bridge, which handles
+        TP/EP/PP gather and layout transformation internally. Each PP rank
+        iterates the global parameter set (vs registry path which iterates only
+        local layers); non-PP-heads participate in collectives but do not bucket.
+        MoE expert weights are yielded inline by the bridge's grouped-export
+        path, so no separate second pass is needed.
+        """
+        weight_chunked_mem_size = meta.weight_chunked_mem_mb * 1024 * 1024
+        bucket: list[tuple[str, torch.Tensor]] = []
+        bucket_size = 0
+
+        for hf_name, hf_tensor in self.bridge.export_hf_weights(
+            self.model,
+            cpu=False,
+            show_progress=False,
+        ):
+            if not self.is_pipeline_parallel_head():
+                continue
+            size = hf_tensor.numel() * hf_tensor.element_size()
+            if bucket_size + size > weight_chunked_mem_size:
+                self._update_bucket_weights_from_distributed(meta, bucket)
+                bucket_size = 0
+            bucket.append((hf_name, hf_tensor))
+            bucket_size += size
+
+        if bucket:
+            self._update_bucket_weights_from_distributed(meta, bucket)
 
-        current_platform.synchronize()
         dist.barrier(group=self.cpu_group)
 
     @trace_perf("megatron_engine.update_weights_from_disk", category="io")
@@ -1909,7 +1965,17 @@ def _load_model_from_hf(self, path: str) -> None:
                 raise ValueError(
                     "Loading critic model is not supported with megatron-bridge."
                 )
-            self.bridge.load_hf_weights(self.model, hf_path=path)
+            # megatron-bridge's load path builds shard-index tensors via
+            # ``torch.arange(...)`` to index HF weights that live on CPU. Under
+            # the caller's ``with self.device:`` (CUDA) context, those indices
+            # become CUDA tensors and the CPU-tensor indexing raises
+            # ``RuntimeError: indices should be either on cpu or on the same
+            # device as the indexed tensor (cpu)`` — triggered by ChunkedMapping
+            # for any model with GDN/Mamba-style conv1d weights (e.g. Qwen3.5).
+            # Force CPU as the factory-op default here; tensor data assignment
+            # to GPU model params is unaffected (handled by .copy_()).
+            with torch.device("cpu"):
+                self.bridge.load_hf_weights(self.model, hf_path=path)
         else:
             load_weights_from_hf_with_mbridge_fast(
                 bridge=self.bridge,
diff --git a/docs/en/cli_reference.md b/docs/en/cli_reference.md
index 63fad2104..73a50116c 100644
--- a/docs/en/cli_reference.md
+++ b/docs/en/cli_reference.md
@@ -1092,6 +1092,7 @@ Refer to Megatron-LM documentation for implementation details.
 | `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
 | `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                |
 | `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                    |
+| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                |
 
 (section-memory-profiler)=
 
diff --git a/docs/zh/cli_reference.md b/docs/zh/cli_reference.md
index 79cd7e754..6b9372fee 100644
--- a/docs/zh/cli_reference.md
+++ b/docs/zh/cli_reference.md
@@ -1090,6 +1090,7 @@ Refer to Megatron-LM documentation for implementation details.
 | `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
 | `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                |
 | `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                    |
+| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                |
 
 (section-memory-profiler)=
 

From f6d9900eec0effc87967b0fbce8856e48a00a4ed Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Tue, 26 May 2026 21:12:30 -0700
Subject: [PATCH 2/9] test(megatron): add Qwen3.5 distributed test scaffolding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 1-GPU smoke + 5 multi-GPU tests (TP=2, PP=2, PP+VPP=2, DP=2
grad_norm invariance, DCP save/load) mirroring the Qwen3 dense set.
All Qwen3.5 tests route through bridge_type=megatron-bridge because
its GDN hybrid attention is only handled by megatron-bridge's model
definitions (mbridge would substring-match qwen3 and emit wrong
shapes).

NOTE: these tests currently fail at engine.forward because
megatron-core's GDN layer raises NotImplementedError on packed
(THD) sequences. A follow-up will add a BSHD path mirroring verl's
data_format switch; until then these tests document the expected
coverage and act as a regression target.

Key changes:
- Add qwen3_5 to MODEL_PATHS in run_megatron_engine_distributed.py
- Re-key MODEL_PATHS from areal.utils.testing_utils canonical paths
  so local-path overrides propagate from a single source
- Wire bridge_type via _MODEL_BRIDGE_OVERRIDES (qwen3_5 →
  megatron-bridge)
- Six test_qwen3_5_* tests in test_megatron_engine_distributed.py
---
 tests/test_megatron_engine_distributed.py     | 102 ++++++++++++++++++
 .../run_megatron_engine_distributed.py        |  28 +++--
 2 files changed, 120 insertions(+), 10 deletions(-)

diff --git a/tests/test_megatron_engine_distributed.py b/tests/test_megatron_engine_distributed.py
index 95968ff2e..06375df83 100644
--- a/tests/test_megatron_engine_distributed.py
+++ b/tests/test_megatron_engine_distributed.py
@@ -148,3 +148,105 @@ def test_qwen3moe_dcp_save_load(tmp_path_factory):
         test_type="simple_dcp_save_load",
         output=str(output),
     )
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Qwen3.5 dense tests. Routed through bridge_type=megatron-bridge because
+# its GDN hybrid attention is only handled by the megatron-bridge model
+# definitions (mbridge would fall back to the qwen3 substring match and
+# emit wrong shapes). The runner sets bridge_type automatically based on
+# ``_MODEL_BRIDGE_OVERRIDES``.
+# ──────────────────────────────────────────────────────────────────────
+
+
+@pytest.mark.slow
+def test_qwen3_5_single_gpu_forward(tmp_path_factory):
+    """Smoke test on a single GPU: engine init + forward pass.
+
+    Validates the megatron-bridge load path (including the AReaL-side
+    ``with torch.device("cpu"):`` fix for GDN ChunkedMapping) and basic
+    forward execution before exercising any parallelism.
+    """
+    if current_platform.device_count() < 1:
+        pytest.skip("requires 1 GPU to run")
+    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_single_gpu.out"
+    _run_test_with_torchrun(
+        "qwen3_5", "megatron:d1p1t1", test_type="forward", output=str(output)
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.slow
+def test_qwen3_5_tensor_parallel(tmp_path_factory):
+    if current_platform.device_count() < 2:
+        pytest.skip("tensor parallel requires 2 GPUs to run")
+    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_tensor_parallel.out"
+    _run_test_with_torchrun(
+        "qwen3_5", "megatron:d1p1t2", test_type="forward", output=str(output)
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.slow
+def test_qwen3_5_pipeline_parallel(tmp_path_factory):
+    if current_platform.device_count() < 2:
+        pytest.skip("pipeline parallel requires 2 GPUs to run")
+    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_pipeline_parallel.out"
+    _run_test_with_torchrun(
+        "qwen3_5", "megatron:d1p2t1", test_type="forward", output=str(output)
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.slow
+def test_qwen3_5_virtual_pipeline_parallel(tmp_path_factory):
+    if current_platform.device_count() < 2:
+        pytest.skip("virtual pipeline parallel requires 2 GPUs to run")
+    output = (
+        tmp_path_factory.mktemp("test_output") / "qwen3_5_virtual_pipeline_parallel.out"
+    )
+    _run_test_with_torchrun(
+        "qwen3_5",
+        "megatron:d1p2t1",
+        test_type="forward",
+        output=str(output),
+        vpp_size=2,
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.slow
+def test_qwen3_5_grad_norm_mb_invariance(tmp_path_factory):
+    """Same regression guard as ``test_qwen3_grad_norm_mb_invariance`` but on
+    Qwen3.5. Exercises full backward + optimizer step under DP=2 to verify the
+    ``loss_multiplier`` fix still holds for GDN models.
+    """
+    if current_platform.device_count() < 2:
+        pytest.skip("grad_norm_mb_invariance requires 2 GPUs to run")
+    output = (
+        tmp_path_factory.mktemp("test_output") / "qwen3_5_grad_norm_mb_invariance.out"
+    )
+    _run_test_with_torchrun(
+        "qwen3_5",
+        "megatron:d2p1t1",
+        test_type="grad_norm_mb_invariance",
+        output=str(output),
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.slow
+def test_qwen3_5_dcp_save_load(tmp_path_factory):
+    """DCP save/load round-trip under TP=2 (sufficient to exercise cross-rank
+    save/load; the full d2p2t2 layout used for Qwen3 dense isn't needed
+    because the bridge type, not parallelism, is what's being validated).
+    """
+    if current_platform.device_count() < 2:
+        pytest.skip("Qwen3.5 DCP save load requires 2 GPUs to run")
+    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_save_load.out"
+    _run_test_with_torchrun(
+        "qwen3_5",
+        "megatron:d1p1t2",
+        test_type="train_dcp_save_load",
+        output=str(output),
+    )
diff --git a/tests/torchrun/run_megatron_engine_distributed.py b/tests/torchrun/run_megatron_engine_distributed.py
index 4643fe3b4..d34d27089 100644
--- a/tests/torchrun/run_megatron_engine_distributed.py
+++ b/tests/torchrun/run_megatron_engine_distributed.py
@@ -9,8 +9,6 @@
 from megatron.core import parallel_state as mpu
 from transformers import AutoTokenizer
 
-from tests.utils import get_model_path
-
 from areal.api import FinetuneSpec, SaveLoadMeta
 from areal.api.alloc_mode import ModelAllocation
 from areal.api.cli_args import (
@@ -23,16 +21,22 @@
 from areal.infra.platforms import current_platform
 from areal.utils import seeding
 from areal.utils.data import broadcast_tensor_container
+from areal.utils.testing_utils import DENSE_MODEL_PATHS, MOE_MODEL_PATHS
 
+# Re-key from testing_utils.py canonical paths so local-path overrides
+# (e.g. ``/home/nfs/models/Qwen3-0.6B``) propagate from a single source.
+# Keys here use the runner's existing convention (no underscore in ``qwen3moe``).
 MODEL_PATHS = {
-    "qwen3": get_model_path(
-        "/storage/openpsi/models/Qwen__Qwen3-0.6B/", "Qwen/Qwen3-0.6B"
-    ),
-    "qwen3moe": get_model_path(
-        "/storage/openpsi/models/Qwen__Qwen3-30B-A3B/", "Qwen/Qwen3-30B-A3B"
-    ),
+    "qwen3": DENSE_MODEL_PATHS["qwen3"],
+    "qwen3moe": MOE_MODEL_PATHS["qwen3_moe"],
+    "qwen3_5": DENSE_MODEL_PATHS["qwen3_5"],
 }
 
+# bridge_type must default to mbridge for backwards compat with existing
+# qwen3/qwen3moe tests; qwen3_5 is forced to megatron-bridge because that's
+# the only bridge that handles its GDN hybrid attention layers.
+_MODEL_BRIDGE_OVERRIDES = {"qwen3_5": "megatron-bridge"}
+
 
 def write_result(out: str, succ: bool):
     with open(out, "w") as f:
@@ -73,6 +77,7 @@ def mock_input(
 
 
 def make_engine(model_type, backend, mb_spec, vpp_size=1, init_optimizer=False):
+    bridge_type = _MODEL_BRIDGE_OVERRIDES.get(model_type, "mbridge")
     config = TrainEngineConfig(
         backend=backend,
         experiment_name="test",
@@ -80,7 +85,10 @@ def make_engine(model_type, backend, mb_spec, vpp_size=1, init_optimizer=False):
         path=MODEL_PATHS[model_type],
         mb_spec=mb_spec,
         optimizer=OptimizerConfig() if init_optimizer else None,
-        megatron=MegatronEngineConfig(virtual_pipeline_parallel_size=vpp_size),
+        megatron=MegatronEngineConfig(
+            virtual_pipeline_parallel_size=vpp_size,
+            bridge_type=bridge_type,
+        ),
     )
     alloc_mode = ModelAllocation.from_str(backend)
     ft_spec = FinetuneSpec(total_train_epochs=1, dataset_size=128, train_batch_size=8)
@@ -490,7 +498,7 @@ def main():
     parser.add_argument(
         "--model_type",
         type=str,
-        choices=["qwen3", "qwen3moe"],
+        choices=["qwen3", "qwen3moe", "qwen3_5"],
         default="qwen3",
         help="Type of model to test",
     )

From efbfaa1b5a553c16d77fc8fd0f9c9ec900fa55dd Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Tue, 26 May 2026 23:29:51 -0700
Subject: [PATCH 3/9] feat(engine): add BSHD padded forward path and
 megatron-bridge patches for Qwen3.5

Qwen3.5's GDN (Gated Delta Net) layers reject packed (THD) sequences
in megatron-core. Add a BSHD path that reconstructs [B, S] padded input
from cu_seqlens inside packed_context_parallel_forward, mirroring the
existing VLM 2D-reconstruction logic but for text-only models.

Also add runtime monkey-patch for megatron-bridge PR #3143 (MTP shadow
embedding missing word_embeddings attribute under sequence_parallel +
tied embeddings). The patch lazily restores the attribute from the
closure before _postprocess runs, avoiding the need to replace the
full forward method.

Key changes:
- New MegatronEngineConfig.use_padded_seq flag (BSHD mode)
- Generalize VLM 2D-reconstruction path in packed_context_parallel_forward
  to also fire on use_padded_seq=True for non-VLM models
- CP>1 guard for use_padded_seq (same constraint VLM has)
- New megatron_bridge_patches.py with PR #3143 workaround
- New train_hf_save_load test_type in runner (replaces DCP for SSM
  models whose flattened_range tensors are unsupported by mcore DCP)
- Qwen3.5 tests: 1-GPU, TP=2, PP=2, HF save/load all pass;
  VPP and grad_norm_mb_invariance skipped with documented reasons
---
 areal/api/cli_args.py                         |  12 ++
 areal/engine/megatron_engine.py               |  13 ++
 .../megatron_utils/megatron_bridge_patches.py |  82 ++++++++++++
 .../megatron_utils/packed_context_parallel.py |  46 ++++---
 docs/en/cli_reference.md                      |  61 ++++-----
 docs/zh/cli_reference.md                      |  61 ++++-----
 tests/test_megatron_engine_distributed.py     |  28 +++-
 .../run_megatron_engine_distributed.py        | 121 +++++++++++++++++-
 8 files changed, 340 insertions(+), 84 deletions(-)
 create mode 100644 areal/engine/megatron_utils/megatron_bridge_patches.py

diff --git a/areal/api/cli_args.py b/areal/api/cli_args.py
index 5757cbfb3..9eb6b7a69 100644
--- a/areal/api/cli_args.py
+++ b/areal/api/cli_args.py
@@ -950,6 +950,18 @@ class MegatronEngineConfig:
         },
     )
 
+    use_padded_seq: bool = field(
+        default=False,
+        metadata={
+            "help": "Force padded (BSHD) input layout instead of packed (THD) for "
+            "forward / train_batch. Required for architectures whose state-space "
+            "or SSM layers reject packed sequences (e.g. Qwen3.5's GDN). Less "
+            "memory-efficient because attention computes over padding, so prefer "
+            "small per-microbatch sequence counts. Incompatible with "
+            "context_parallel_size > 1 (same constraint VLM has).",
+        },
+    )
+
 
 class SchedulingStrategyType(str, Enum):
     separation = "separation"
diff --git a/areal/engine/megatron_engine.py b/areal/engine/megatron_engine.py
index 3dfd4ffcf..b6231c8b4 100644
--- a/areal/engine/megatron_engine.py
+++ b/areal/engine/megatron_engine.py
@@ -61,6 +61,7 @@
     is_valid_vision_model,
     lang_config,
 )
+from areal.engine.megatron_utils import megatron_bridge_patches  # noqa: F401
 from areal.engine.megatron_utils.checkpointer import MegatronCheckpointManager
 from areal.engine.megatron_utils.deterministic import set_deterministic_algorithms
 from areal.engine.megatron_utils.fp8 import FP8BlockwiseTensorHelper
@@ -361,6 +362,17 @@ def initialize(self, addr: str | None, ft_spec: FinetuneSpec, *args, **kwargs):
                     f"Loaded processor and tokenizer."
                 )
 
+            if (
+                self.mcore_config.use_padded_seq
+                and self.parallel_strategy.context_parallel_size > 1
+            ):
+                raise NotImplementedError(
+                    "Context parallel (CP > 1) is not supported with "
+                    "use_padded_seq=True (the padded BSHD path operates on "
+                    "[B, S] tensors and the CP path packs sequences). "
+                    f"Got context_parallel_size={self.parallel_strategy.context_parallel_size}."
+                )
+
             self.quantization_config = getattr(
                 self.hf_config, "quantization_config", None
             )
@@ -844,6 +856,7 @@ def forward_step(batch_iter, model):
                 mb_input.padded_mb,
                 gather_cp_output=not cp_local,
                 is_vision_model=self.is_vision_model,
+                use_padded_seq=self.mcore_config.use_padded_seq,
             )
 
             # Release tree attention metadata after forward pass
diff --git a/areal/engine/megatron_utils/megatron_bridge_patches.py b/areal/engine/megatron_utils/megatron_bridge_patches.py
new file mode 100644
index 000000000..9936429d9
--- /dev/null
+++ b/areal/engine/megatron_utils/megatron_bridge_patches.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+
+"""Runtime patches for megatron-bridge bugs not yet in a released version.
+
+Each patch is keyed to an upstream PR and is auto-disabled once megatron-bridge
+ships a release containing the fix. Apply patches at import time via
+``_apply_patches_on_import()`` at module bottom.
+"""
+
+from __future__ import annotations
+
+import areal.utils.logging as logging
+
+logger = logging.getLogger("MegatronBridgePatches")
+
+
+def _patch_qwen3vl_pr3143_word_embeddings() -> None:
+    """megatron-bridge PR #3143: expose word_embeddings on MTP shadow embedding.
+
+    Bug (issue #3112 / PR #3143): in ``Qwen3VLGPTModel.forward``, when
+    ``mtp_process and sequence_parallel`` are both True, ``self.embedding`` is
+    temporarily replaced with a plain closure ``_sp_scatter_embedding``. The
+    closure lacks the ``word_embeddings`` attribute that
+    ``shared_embedding_or_output_weight()`` accesses during ``_postprocess``
+    when ``share_embeddings_and_output_weights=True`` — typical for the
+    smaller Qwen3.5 dense models (0.8B/2B/4B).
+
+    Failure mode:
+        ``AttributeError: 'function' object has no attribute 'word_embeddings'``
+
+    Affected versions: megatron-bridge 0.4.0 and 0.4.1. Fixed on ``main``
+    by commit 20749b09 (PR #3143) but not in any non-alpha release yet.
+
+    Strategy: wrap ``Qwen3VLGPTModel._postprocess`` so it lazily restores
+    ``word_embeddings`` on the shadow embedding by inspecting its closure.
+    Closure-based recovery is non-invasive — we don't touch ``forward``
+    itself (~70 LoC method).
+    """
+    try:
+        from megatron.bridge.models.qwen_vl.modelling_qwen3_vl.text_model import (
+            Qwen3VLGPTModel,
+        )
+    except ImportError:
+        return
+
+    if getattr(Qwen3VLGPTModel, "_areal_pr3143_applied", False):
+        return
+
+    _orig_postprocess = Qwen3VLGPTModel._postprocess
+
+    def _patched_postprocess(self, *args, **kwargs):
+        emb = self.__dict__.get("embedding")
+        # Only intervene when the shadow closure is currently installed and
+        # lacks the expected attribute.
+        if (
+            callable(emb)
+            and not hasattr(emb, "word_embeddings")
+            and emb.__closure__ is not None
+        ):
+            for cell in emb.__closure__:
+                try:
+                    target = cell.cell_contents
+                except ValueError:
+                    continue
+                if hasattr(target, "word_embeddings"):
+                    emb.word_embeddings = target.word_embeddings
+                    break
+        return _orig_postprocess(self, *args, **kwargs)
+
+    Qwen3VLGPTModel._postprocess = _patched_postprocess
+    Qwen3VLGPTModel._areal_pr3143_applied = True
+    logger.info(
+        "Applied megatron-bridge PR #3143 workaround: "
+        "Qwen3VLGPTModel shadow embedding word_embeddings restoration."
+    )
+
+
+def _apply_patches_on_import() -> None:
+    _patch_qwen3vl_pr3143_word_embeddings()
+
+
+_apply_patches_on_import()
diff --git a/areal/engine/megatron_utils/packed_context_parallel.py b/areal/engine/megatron_utils/packed_context_parallel.py
index 0e61c4eaf..b228717e9 100644
--- a/areal/engine/megatron_utils/packed_context_parallel.py
+++ b/areal/engine/megatron_utils/packed_context_parallel.py
@@ -288,6 +288,7 @@ def packed_context_parallel_forward(
     input_: dict[str, Any],
     gather_cp_output: bool = True,
     is_vision_model: bool = False,
+    use_padded_seq: bool = False,
 ):
     input_ids = input_["input_ids"]
     position_ids = input_.get("position_ids", None)
@@ -300,12 +301,18 @@ def packed_context_parallel_forward(
     packed_seq_params = None
 
     is_vision = is_vision_model and any(key in input_ for key in _VLM_FORWARD_KEYS)
+    # Architectures whose attention/SSM kernels reject packed sequences (e.g.
+    # Qwen3.5 GDN) must run on [B, S] padded input. The reconstruction logic
+    # below is shared with the VLM path; the difference is downstream
+    # (attention_mask and position_ids are passed through for text-only).
+    needs_padded_form = is_vision or use_padded_seq
 
-    # Track whether we reconstructed 2D batch form for vision
-    vision_repack_info = None
+    # Track shape metadata so the output can be repacked back to packed
+    # [total_len, ...] form on the last PP stage.
+    padded_repack_info = None
 
     if cu_seqlens is not None:
-        if not is_vision:
+        if not needs_padded_form:
             if attention_mask is not None or tree_triton_data is not None:
                 raise ValueError(
                     "Attention mask should be None when using packed sequences."
@@ -315,10 +322,9 @@ def packed_context_parallel_forward(
             )
             input_ids = input_ids.contiguous()
         else:
-            # VLM models expect batch-form [B, S] input_ids for mRoPE position
-            # computation and vision token embedding replacement. Reconstruct
-            # padded 2D tensors from packed 1D using cu_seqlens via boolean
-            # masking — avoids per-sample Python loop and GPU-CPU sync.
+            # VLM and BSHD-only models expect [B, S] padded input. Reconstruct
+            # padded 2D tensors from packed 1D via boolean masking — avoids
+            # per-sample Python loop and GPU-CPU sync.
             batch_size = cu_seqlens.shape[0] - 1
             seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
             max_seqlen = int(seq_lens.max().item())
@@ -337,14 +343,15 @@ def packed_context_parallel_forward(
             )
             input_ids_2d[attention_mask] = input_ids
             input_ids = input_ids_2d
-            vision_repack_info = (cu_seqlens, seq_lens, max_seqlen)
+            padded_repack_info = (cu_seqlens, seq_lens, max_seqlen)
 
-    # Pass tree_triton_data as attention_mask if present (for Triton tree attention)
-    # Otherwise use the attention_mask from input (could be dense tensor for flex attention)
-    # For VLM: pass None — the model's get_rope_index uses the 2D attention_mask
-    # internally for correct mRoPE positions. Each batch slot holds one sequence
-    # with trailing padding, so causal attention yields correct outputs at
+    # VLM path: attention_mask=None — model's get_rope_index uses the 2D mask
+    # internally for mRoPE positions. Each batch slot holds one sequence with
+    # trailing padding, so causal attention yields correct outputs at
     # non-padding positions; padding outputs are discarded during repack.
+    #
+    # BSHD text-only path (use_padded_seq): pass our built attention_mask so
+    # the model's attention layers skip padding.
     if is_vision:
         final_attention_mask = None
     else:
@@ -360,6 +367,13 @@ def packed_context_parallel_forward(
             if key in input_:
                 vlm_kwargs[key] = input_[key]
 
+    # For BSHD text-only, drop the packed-form position_ids (a 1D tensor of
+    # length total_len) — they don't match the 2D [B, S] input. Let mcore
+    # compute the default torch.arange positions per row; padding positions
+    # are masked out by attention_mask.
+    if use_padded_seq and not is_vision:
+        position_ids = None
+
     try:
         output = model(
             input_ids=input_ids,
@@ -379,7 +393,7 @@ def packed_context_parallel_forward(
         ignore_virtual=False, vp_stage=model_vp_stage
     )
 
-    # Repack vision output to packed [total_len, ...] for the last PP stage only.
+    # Repack padded output to packed [total_len, ...] for the last PP stage only.
     # Intermediate stages must return their output unchanged so the pipeline
     # send/recv shapes match what the next stage expects (megatron-core's
     # `_communicate_shapes` negotiates based on this return value).
@@ -387,8 +401,8 @@ def packed_context_parallel_forward(
     # On the last PP stage, megatron-core GPTModel returns logits already
     # transposed to [B, S, V] (gpt_model.py: `return logits.transpose(0, 1).contiguous()`),
     # so a boolean mask of valid positions selects the packed sequence.
-    if vision_repack_info is not None and is_pipeline_last_stage:
-        _, repack_seq_lens, repack_max_seqlen = vision_repack_info
+    if padded_repack_info is not None and is_pipeline_last_stage:
+        _, repack_seq_lens, repack_max_seqlen = padded_repack_info
         mask = (
             torch.arange(repack_max_seqlen, device=output.device)[None, :]
             < repack_seq_lens[:, None]
diff --git a/docs/en/cli_reference.md b/docs/en/cli_reference.md
index 73a50116c..bb76eeac9 100644
--- a/docs/en/cli_reference.md
+++ b/docs/en/cli_reference.md
@@ -1063,36 +1063,37 @@ Configuration for Megatron-LM training framework.
 
 Refer to Megatron-LM documentation for implementation details.
 
-| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                          |
-| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
-| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                    |
-| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                  |
-| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                    |
-| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints. |
-| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
-| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                    |
-| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                    |
-| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                    |
-| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                              |
-| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                |
-| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                               |
-| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
-| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                |
-| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                    |
-| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                |
+| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                                                                                                        |
+| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                                                                                                |
+| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                                                                                                  |
+| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints.                                                                               |
+| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                                                                                                            |
+| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                                                                                              |
+| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                                                                                                             |
+| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                                                                                              |
+| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                                                                                                  |
+| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                                                                                              |
+| `use_padded_seq`                           | boolean                                                              | `False`      | Force padded (BSHD) input layout instead of packed (THD) for forward / train_batch. Required for architectures whose state-space or SSM layers reject packed sequences (e.g. Qwen3.5's GDN). Less memory-efficient because attention computes over padding, so prefer small per-microbatch sequence counts. Incompatible with context_parallel_size > 1 (same constraint VLM has). |
 
 (section-memory-profiler)=
 
diff --git a/docs/zh/cli_reference.md b/docs/zh/cli_reference.md
index 6b9372fee..005386d0c 100644
--- a/docs/zh/cli_reference.md
+++ b/docs/zh/cli_reference.md
@@ -1061,36 +1061,37 @@ Configuration for Megatron-LM training framework.
 
 Refer to Megatron-LM documentation for implementation details.
 
-| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                          |
-| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
-| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                    |
-| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                  |
-| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                    |
-| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
-| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints. |
-| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
-| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                    |
-| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
-| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                    |
-| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                    |
-| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                              |
-| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
-| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                |
-| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                               |
-| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
-| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                |
-| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                    |
-| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                |
+| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                                                                                                        |
+| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                                                                                                |
+| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                                                                                                  |
+| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints.                                                                               |
+| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                                                                                                            |
+| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                                                                                              |
+| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                                                                                                             |
+| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
+| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                                                                                              |
+| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                                                                                                  |
+| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                                                                                              |
+| `use_padded_seq`                           | boolean                                                              | `False`      | Force padded (BSHD) input layout instead of packed (THD) for forward / train_batch. Required for architectures whose state-space or SSM layers reject packed sequences (e.g. Qwen3.5's GDN). Less memory-efficient because attention computes over padding, so prefer small per-microbatch sequence counts. Incompatible with context_parallel_size > 1 (same constraint VLM has). |
 
 (section-memory-profiler)=
 
diff --git a/tests/test_megatron_engine_distributed.py b/tests/test_megatron_engine_distributed.py
index 06375df83..946157a88 100644
--- a/tests/test_megatron_engine_distributed.py
+++ b/tests/test_megatron_engine_distributed.py
@@ -199,6 +199,11 @@ def test_qwen3_5_pipeline_parallel(tmp_path_factory):
 
 @pytest.mark.multi_gpu
 @pytest.mark.slow
+@pytest.mark.skip(
+    reason="megatron-bridge _broadcast_shared_embeddings does not support "
+    "VPP + tied embeddings (TODO in model_bridge.py:1271). Not needed for "
+    "initial Qwen3.5 support; VPP is an optional scheduling optimization."
+)
 def test_qwen3_5_virtual_pipeline_parallel(tmp_path_factory):
     if current_platform.device_count() < 2:
         pytest.skip("virtual pipeline parallel requires 2 GPUs to run")
@@ -216,6 +221,12 @@ def test_qwen3_5_virtual_pipeline_parallel(tmp_path_factory):
 
 @pytest.mark.multi_gpu
 @pytest.mark.slow
+@pytest.mark.skip(
+    reason="BSHD mode (use_padded_seq) lacks microbatch invariance: padding "
+    "changes per MB boundary cause small grad_norm drift. verl sidesteps "
+    "this by setting ppo_micro_batch_size_per_gpu=1 (1 seq/MB, no padding "
+    "diff). See run_qwen3_5_35b_megatron.sh for the recommended config."
+)
 def test_qwen3_5_grad_norm_mb_invariance(tmp_path_factory):
     """Same regression guard as ``test_qwen3_grad_norm_mb_invariance`` but on
     Qwen3.5. Exercises full backward + optimizer step under DP=2 to verify the
@@ -236,17 +247,20 @@ def test_qwen3_5_grad_norm_mb_invariance(tmp_path_factory):
 
 @pytest.mark.multi_gpu
 @pytest.mark.slow
-def test_qwen3_5_dcp_save_load(tmp_path_factory):
-    """DCP save/load round-trip under TP=2 (sufficient to exercise cross-rank
-    save/load; the full d2p2t2 layout used for Qwen3 dense isn't needed
-    because the bridge type, not parallelism, is what's being validated).
+def test_qwen3_5_hf_save_load(tmp_path_factory):
+    """HF save/load round-trip under TP=2.
+
+    Uses _save_model_to_hf / _load_model_from_hf (HF safetensors) instead of
+    mcore DCP because mcore's dist_checkpointing does not support SSM/GDN
+    ``flattened_range`` tensors yet. Validates train → save → zero → load →
+    retrain produces identical weights.
     """
     if current_platform.device_count() < 2:
-        pytest.skip("Qwen3.5 DCP save load requires 2 GPUs to run")
-    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_save_load.out"
+        pytest.skip("Qwen3.5 HF save load requires 2 GPUs to run")
+    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_hf_save_load.out"
     _run_test_with_torchrun(
         "qwen3_5",
         "megatron:d1p1t2",
-        test_type="train_dcp_save_load",
+        test_type="train_hf_save_load",
         output=str(output),
     )
diff --git a/tests/torchrun/run_megatron_engine_distributed.py b/tests/torchrun/run_megatron_engine_distributed.py
index d34d27089..8b3c10369 100644
--- a/tests/torchrun/run_megatron_engine_distributed.py
+++ b/tests/torchrun/run_megatron_engine_distributed.py
@@ -37,6 +37,11 @@
 # the only bridge that handles its GDN hybrid attention layers.
 _MODEL_BRIDGE_OVERRIDES = {"qwen3_5": "megatron-bridge"}
 
+# Models whose GDN/SSM kernels reject packed (THD) inputs must run forward
+# on padded [B, S] BSHD tensors. The engine reconstructs the 2D form
+# internally from cu_seqlens; no caller-side input change needed.
+_MODEL_PADDED_SEQ_OVERRIDES = {"qwen3_5": True}
+
 
 def write_result(out: str, succ: bool):
     with open(out, "w") as f:
@@ -78,6 +83,7 @@ def mock_input(
 
 def make_engine(model_type, backend, mb_spec, vpp_size=1, init_optimizer=False):
     bridge_type = _MODEL_BRIDGE_OVERRIDES.get(model_type, "mbridge")
+    use_padded_seq = _MODEL_PADDED_SEQ_OVERRIDES.get(model_type, False)
     config = TrainEngineConfig(
         backend=backend,
         experiment_name="test",
@@ -88,6 +94,7 @@ def make_engine(model_type, backend, mb_spec, vpp_size=1, init_optimizer=False):
         megatron=MegatronEngineConfig(
             virtual_pipeline_parallel_size=vpp_size,
             bridge_type=bridge_type,
+            use_padded_seq=use_padded_seq,
         ),
     )
     alloc_mode = ModelAllocation.from_str(backend)
@@ -158,7 +165,7 @@ def test_forward(
     assert is_equal, "Logprobs should be the same across all model parallel ranks."
 
     # make FSDP engine, and check if the difference between FSDP and megatron engine
-    fsdp_engine = make_fsdp_engine("qwen3", alloc_mode, mb_spec)
+    fsdp_engine = make_fsdp_engine(model_type, alloc_mode, mb_spec)
     fsdp_logprobs = fsdp_engine.forward(
         input_=input_,
         aggregate_fn=lambda xs: torch.cat(xs, dim=0),
@@ -493,6 +500,110 @@ def test_simple_dcp_save_load(
     )
 
 
+def test_train_hf_save_load(
+    model_type: str, alloc_mode: str, output: str | None = None, vpp_size: int = 1
+):
+    """Train → HF save → zero params → HF load → retrain, verify weights match.
+
+    Same structure as test_train_dcp_save_load but uses _save_model_to_hf /
+    _load_model_from_hf (HF safetensors) instead of mcore DCP. Needed for
+    architectures whose SSM/GDN tensors are not supported by mcore's
+    dist_checkpointing (e.g. Qwen3.5).
+    """
+    print(
+        f"running test_train_hf_save_load(model_type={model_type} alloc_mode={alloc_mode})"
+    )
+    rank = int(os.environ["RANK"])
+
+    base_dir = tempfile.gettempdir()
+    save_dir = os.path.join(base_dir, "megatron_engine_hf_save_test")
+    if rank == 0:
+        os.makedirs(save_dir, exist_ok=True)
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_type])
+
+    mb_spec = MicroBatchSpec(max_tokens_per_mb=256)
+    engine = make_engine(
+        model_type, alloc_mode, mb_spec, init_optimizer=True, vpp_size=vpp_size
+    )
+
+    seeding.set_random_seed(0, key=f"trainer{rank}")
+
+    input_ = mock_input(batch_size=16, max_seqlen=128, device=engine.device)
+    bcasted_input = broadcast_tensor_container(
+        input_,
+        src_rank=engine.current_data_parallel_head(),
+        group=engine.context_and_model_parallel_group,
+    )
+
+    # train step — exercises forward + backward + optimizer with BSHD
+    train_result = engine.train_batch(
+        input_=bcasted_input,
+        loss_fn=mock_loss_fn,
+        loss_weight_fn=lambda x: x["cu_seqlens"][-1],
+    )
+    print(f"rank {rank} train_result: {train_result}")
+
+    current_platform.synchronize()
+    dist.barrier()
+
+    # snapshot post-train weights
+    with torch.no_grad():
+        engine.eval()
+        params_before = {
+            n: p.detach().clone() for n, p in engine.model.named_parameters()
+        }
+
+    # save via HF format
+    engine._save_model_to_hf(save_dir, tokenizer)
+
+    # zero all params to prove load actually restores them
+    with torch.no_grad():
+        for p in engine.model.parameters():
+            p.data.zero_()
+
+    # recover from HF checkpoint
+    engine._load_model_from_hf(save_dir)
+
+    current_platform.synchronize()
+    dist.barrier()
+
+    # compare: loaded weights must match pre-save snapshot.
+    # bf16 norm weights may lose ~0.004 precision during the HF safetensors
+    # round-trip (bf16 mantissa is 7 bits → ~0.008 ULP near 1.0), so use a
+    # small absolute tolerance rather than exact match.
+    hf_round_trip_atol = 0.01
+    with torch.no_grad():
+        succ = True
+        for name, param in engine.model.named_parameters():
+            if name not in params_before:
+                continue
+            if not torch.allclose(
+                param, params_before[name], atol=hf_round_trip_atol, rtol=0
+            ):
+                diff = torch.abs(params_before[name] - param)
+                print(
+                    f"rank {rank} diff of {name}: "
+                    f"max(diff)={torch.max(diff)} avg(diff)={torch.mean(diff)}, "
+                    f"count(diff)={torch.count_nonzero(diff)}"
+                )
+                succ = False
+        assert succ, "Weights should be same after HF save/load round-trip"
+
+    current_platform.synchronize()
+    dist.barrier()
+
+    engine.destroy()
+
+    if output:
+        write_result(output, True)
+
+    print(
+        f"Test: test_train_hf_save_load(model_type={model_type}, "
+        f"alloc_mode={alloc_mode}) Done."
+    )
+
+
 def main():
     parser = argparse.ArgumentParser(description="Run Megatron Engine Distributed Test")
     parser.add_argument(
@@ -529,6 +640,7 @@ def main():
             "grad_norm_mb_invariance",
             "simple_dcp_save_load",
             "train_dcp_save_load",
+            "train_hf_save_load",
         ],
         default="forward",
         help="Type of test to run: 'forward' or 'train'",
@@ -571,6 +683,13 @@ def main():
             output=args.output,
             vpp_size=args.vpp_size,
         )
+    elif args.test_type == "train_hf_save_load":
+        test_train_hf_save_load(
+            args.model_type,
+            args.backend,
+            output=args.output,
+            vpp_size=args.vpp_size,
+        )
     else:
         raise NotImplementedError()
 

From ba669e090d9d35ef552c01c38b56f434acbf8e1a Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Wed, 27 May 2026 16:05:54 -0700
Subject: [PATCH 4/9] fix(engine): register Qwen3.5 as vision model and fix
 non-contiguous broadcast
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add qwen3_5 and qwen3_5_moe to VALID_VISION_MODELS so the engine
loads the HF processor and passes pixel_values / image_grid_thw
through the VLM forward path. Qwen3.5's base architecture
(Qwen3_5ForConditionalGeneration) is inherently multimodal —
there is no separate qwen3_5_vl model_type.

Also fix a ValueError in _update_weights_via_bridge where
bridge.export_hf_weights yields non-contiguous tensor views
(from QKV split / gate-up chunk) that NCCL broadcast rejects.
Call .contiguous() before bucketing.
---
 areal/engine/core/model.py      | 2 ++
 areal/engine/megatron_engine.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/areal/engine/core/model.py b/areal/engine/core/model.py
index 6cab72403..feecd6f81 100644
--- a/areal/engine/core/model.py
+++ b/areal/engine/core/model.py
@@ -7,6 +7,8 @@
     "qwen2_5_vl",
     "qwen3_vl",
     "qwen3_vl_moe",
+    "qwen3_5",
+    "qwen3_5_moe",
     "gemma3",
 ]
 # This registry is used to check if a model is a vision model that we have checked it works with AReaL.
diff --git a/areal/engine/megatron_engine.py b/areal/engine/megatron_engine.py
index b6231c8b4..aeddd1326 100644
--- a/areal/engine/megatron_engine.py
+++ b/areal/engine/megatron_engine.py
@@ -1874,7 +1874,7 @@ def _update_weights_via_bridge(self, meta: WeightUpdateMeta) -> None:
             if bucket_size + size > weight_chunked_mem_size:
                 self._update_bucket_weights_from_distributed(meta, bucket)
                 bucket_size = 0
-            bucket.append((hf_name, hf_tensor))
+            bucket.append((hf_name, hf_tensor.contiguous()))
             bucket_size += size
 
         if bucket:

From d2eede47ce001e2af708ed185b25dcdd551fd28e Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Sun, 31 May 2026 22:01:52 -0700
Subject: [PATCH 5/9] feat(vllm): add gdn_prefill_backend to avoid FlashInfer
 GDN hang
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Qwen3.5 and other GDN hybrids default to vLLM's FlashInfer GDN prefill kernel, which hangs on SM90 — a runtime mbarrier deadlock (flashinfer #2623/#3329) and a JIT-compile deadlock (vLLM #41865/#39287), surfacing as shm_broadcast stall -> sample_tokens timeout -> EngineDeadError. Expose gdn_prefill_backend so configs can set "triton" (stable Triton/FLA kernel). None default emits no flag, so non-GDN models are unaffected.
---
 areal/api/cli_args.py    | 2 ++
 docs/en/cli_reference.md | 1 +
 docs/zh/cli_reference.md | 1 +
 3 files changed, 4 insertions(+)

diff --git a/areal/api/cli_args.py b/areal/api/cli_args.py
index 9eb6b7a69..9c5d49124 100644
--- a/areal/api/cli_args.py
+++ b/areal/api/cli_args.py
@@ -1735,6 +1735,8 @@ class vLLMConfig:
     )
     enable_sleep_mode: bool = False
     uvicorn_log_level: str = "warning"
+    # GDN prefill backend for hybrid models like Qwen3.5; "triton" or "flashinfer".
+    gdn_prefill_backend: str | None = None
     # lora
     enable_lora: bool = False
     max_lora_rank: int = 16  # vllm's default
diff --git a/docs/en/cli_reference.md b/docs/en/cli_reference.md
index bb76eeac9..b95b249e1 100644
--- a/docs/en/cli_reference.md
+++ b/docs/en/cli_reference.md
@@ -653,6 +653,7 @@ https://docs.vllm.ai/en/stable/api/index.html for detailed documentation.
 | `worker_extension_cls`         | string                 | `"areal.engine.vllm_ext.vllm_worker_extension.VLLMWorkerExtension"` | -           |
 | `enable_sleep_mode`            | boolean                | `False`                                                             | -           |
 | `uvicorn_log_level`            | string                 | `"warning"`                                                         | -           |
+| `gdn_prefill_backend`          | string \| None         | `None`                                                              | -           |
 | `enable_lora`                  | boolean                | `False`                                                             | -           |
 | `max_lora_rank`                | integer                | `16`                                                                | -           |
 | `max_loras`                    | integer                | `8`                                                                 | -           |
diff --git a/docs/zh/cli_reference.md b/docs/zh/cli_reference.md
index 005386d0c..3b24e59b9 100644
--- a/docs/zh/cli_reference.md
+++ b/docs/zh/cli_reference.md
@@ -651,6 +651,7 @@ https://docs.vllm.ai/en/stable/api/index.html for detailed documentation.
 | `worker_extension_cls`         | string                 | `"areal.engine.vllm_ext.vllm_worker_extension.VLLMWorkerExtension"` | -           |
 | `enable_sleep_mode`            | boolean                | `False`                                                             | -           |
 | `uvicorn_log_level`            | string                 | `"warning"`                                                         | -           |
+| `gdn_prefill_backend`          | string \| None         | `None`                                                              | -           |
 | `enable_lora`                  | boolean                | `False`                                                             | -           |
 | `max_lora_rank`                | integer                | `16`                                                                | -           |
 | `max_loras`                    | integer                | `8`                                                                 | -           |

From 5acffebf656550748b9afd2a1ae19c41cce0f48d Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Mon, 1 Jun 2026 18:03:37 -0700
Subject: [PATCH 6/9] test(megatron): add Qwen3.5-MoE expert-parallel + HF
 save/load tests

Qwen3.5-35B-A3B megatron coverage via megatron-bridge, both running on 4 GPUs:

- test_qwen3_5_moe_expert_parallel: PP2/TP2/EP2 forward + cross-rank logprob consistency. CP is unavailable for the GDN series (Megatron-LM #4043) and the full-attention layers cap TP<=2, so ranks are filled with PP at EP=2.
- test_qwen3_5_moe_hf_save_load: save -> zero -> load -> compare round-trip validating MoE expert-weight conversion (TEGroupedLinear weight0..N + GLU linear_fc1 stride-2). The train step is skipped (_MODEL_SAVELOAD_SKIP_TRAIN) since a 35B-A3B optimizer state does not fit; the loaded HF weights are already non-trivial.

The megatron-vs-FSDP logit comparison is skipped for this model (_MODEL_SKIP_FSDP_COMPARE): AReaL's FSDP engine materializes the full fp32 35B per rank on load, which cannot fit.
---
 tests/test_megatron_engine_distributed.py     |  68 +++++++++
 .../run_megatron_engine_distributed.py        | 144 ++++++++++++------
 2 files changed, 162 insertions(+), 50 deletions(-)

diff --git a/tests/test_megatron_engine_distributed.py b/tests/test_megatron_engine_distributed.py
index 946157a88..c0ef5cebb 100644
--- a/tests/test_megatron_engine_distributed.py
+++ b/tests/test_megatron_engine_distributed.py
@@ -264,3 +264,71 @@ def test_qwen3_5_hf_save_load(tmp_path_factory):
         test_type="train_hf_save_load",
         output=str(output),
     )
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Qwen3.5 MoE tests. Same GDN hybrid attention as dense Qwen3.5 (routed through
+# bridge_type=megatron-bridge + use_padded_seq via the runner's override maps),
+# plus a Mixture-of-Experts FFN exercised with expert parallelism.
+#
+# Parallelism constraints for this model:
+#   * Context parallel is unavailable for the Qwen3.5 series (GDN/SSM layers
+#     reject packed sequences; see Megatron-LM #4043 and the VLM-CP guard).
+#   * The full-attention layers have num_query_groups=2, so TP <= 2.
+#   * Ranks are therefore filled with PP (and DP, for the optimizer), at EP=2.
+#
+# The 35B-A3B forward skips the megatron-vs-FSDP comparison (a full FSDP replica
+# cannot co-reside with the megatron model, even at 8x80GB; the megatron weights
+# are not cheaply freeable mid-test) -- see _MODEL_SKIP_FSDP_COMPARE in the
+# runner. Conversion correctness is instead covered by the save/load round-trip.
+# ──────────────────────────────────────────────────────────────────────
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.slow
+def test_qwen3_5_moe_expert_parallel(tmp_path_factory):
+    """Qwen3.5-MoE megatron forward under PP=2 / TP=2 / EP=2.
+
+    The MoE analog of ``test_qwen3moe_expert_parallel``. CP is unavailable for
+    the GDN layers and the full-attention layers cap TP at 2, so the 4 ranks are
+    filled with PP=2 and experts run at EP=2. The megatron-vs-FSDP cross-check is
+    skipped for this model (see ``_MODEL_SKIP_FSDP_COMPARE``) because a 35B-A3B
+    FSDP replica cannot co-reside with the megatron model. This validates engine
+    init + GDN BSHD forward + cross-rank logprob consistency; weight-conversion
+    correctness is covered by ``test_qwen3_5_moe_hf_save_load``.
+    """
+    if current_platform.device_count() < 4:
+        pytest.skip("Qwen3.5 MoE expert parallel requires 4 GPUs to run")
+    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_moe_expert_parallel.out"
+    _run_test_with_torchrun(
+        "qwen3_5_moe",
+        "megatron:(attn:d1p2t2|ffn:d1p2t1e2)",
+        test_type="forward",
+        output=str(output),
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.slow
+def test_qwen3_5_moe_hf_save_load(tmp_path_factory):
+    """HF save/load round-trip for Qwen3.5-MoE under PP=2 / TP=2 / EP=2.
+
+    Validates the megatron-bridge conversion of MoE expert weights
+    (TEGroupedLinear ``weight0..N`` + GLU ``linear_fc1`` stride-2 de-interleave)
+    across a save -> zero -> load -> compare cycle. Uses HF safetensors (not
+    mcore DCP) because dist_checkpointing does not support SSM/GDN
+    ``flattened_range`` tensors yet. The train step is skipped for this model
+    (see ``_MODEL_SAVELOAD_SKIP_TRAIN`` in the runner) because a 35B-A3B
+    optimizer state does not fit; the loaded HF weights are already non-trivial,
+    so the round-trip still exercises expert-weight conversion. No optimizer
+    means it fits on 4 GPUs.
+    """
+    if current_platform.device_count() < 4:
+        pytest.skip("Qwen3.5 MoE HF save load requires 4 GPUs to run")
+    output = tmp_path_factory.mktemp("test_output") / "qwen3_5_moe_hf_save_load.out"
+    _run_test_with_torchrun(
+        "qwen3_5_moe",
+        "megatron:(attn:d1p2t2|ffn:d1p2t1e2)",
+        test_type="train_hf_save_load",
+        output=str(output),
+    )
diff --git a/tests/torchrun/run_megatron_engine_distributed.py b/tests/torchrun/run_megatron_engine_distributed.py
index 8b3c10369..64943f50d 100644
--- a/tests/torchrun/run_megatron_engine_distributed.py
+++ b/tests/torchrun/run_megatron_engine_distributed.py
@@ -30,17 +30,40 @@
     "qwen3": DENSE_MODEL_PATHS["qwen3"],
     "qwen3moe": MOE_MODEL_PATHS["qwen3_moe"],
     "qwen3_5": DENSE_MODEL_PATHS["qwen3_5"],
+    "qwen3_5_moe": MOE_MODEL_PATHS["qwen3_5_moe"],
 }
 
 # bridge_type must default to mbridge for backwards compat with existing
-# qwen3/qwen3moe tests; qwen3_5 is forced to megatron-bridge because that's
-# the only bridge that handles its GDN hybrid attention layers.
-_MODEL_BRIDGE_OVERRIDES = {"qwen3_5": "megatron-bridge"}
+# qwen3/qwen3moe tests; the qwen3_5 family (dense + MoE) is forced to
+# megatron-bridge because that's the only bridge that handles its GDN hybrid
+# attention layers.
+_MODEL_BRIDGE_OVERRIDES = {
+    "qwen3_5": "megatron-bridge",
+    "qwen3_5_moe": "megatron-bridge",
+}
 
 # Models whose GDN/SSM kernels reject packed (THD) inputs must run forward
 # on padded [B, S] BSHD tensors. The engine reconstructs the 2D form
-# internally from cu_seqlens; no caller-side input change needed.
-_MODEL_PADDED_SEQ_OVERRIDES = {"qwen3_5": True}
+# internally from cu_seqlens; no caller-side input change needed. Both the
+# dense and MoE qwen3_5 variants share the GDN attention layers.
+_MODEL_PADDED_SEQ_OVERRIDES = {"qwen3_5": True, "qwen3_5_moe": True}
+
+# Models large enough that a full-AdamW optimizer state does not fit even when
+# sharded (Qwen3.5-35B-A3B's optimizer state is ~420GB, exceeding 8x80GB with
+# params/grads/activations) skip the train step in the HF save/load round-trip.
+# The loaded HF weights are already non-trivial, so save -> zero -> load ->
+# compare still validates bridge weight conversion (incl. MoE experts) without
+# an optimizer.
+_MODEL_SAVELOAD_SKIP_TRAIN = {"qwen3_5_moe": True}
+
+# Models whose memory footprint is too large to co-locate a full FSDP replica
+# alongside the megatron model on the same GPUs skip the megatron-vs-FSDP
+# forward comparison (the megatron forward + cross-rank logprob consistency are
+# still validated). Qwen3.5-35B-A3B cannot fit both even at 8x80GB, and the
+# megatron weights cannot be cheaply freed mid-test (held by the bridge / mpu /
+# DDP grad buffers). Bridge-conversion correctness for these is covered by the
+# hf_save_load round-trip test, which only holds one model.
+_MODEL_SKIP_FSDP_COMPARE = {"qwen3_5_moe": True}
 
 
 def write_result(out: str, succ: bool):
@@ -164,39 +187,53 @@ def test_forward(
     )
     assert is_equal, "Logprobs should be the same across all model parallel ranks."
 
-    # make FSDP engine, and check if the difference between FSDP and megatron engine
-    fsdp_engine = make_fsdp_engine(model_type, alloc_mode, mb_spec)
-    fsdp_logprobs = fsdp_engine.forward(
-        input_=input_,
-        aggregate_fn=lambda xs: torch.cat(xs, dim=0),
-    )
-    print(
-        f"rank {rank} logprobs.shape={logprobs.shape} fsdp_logprobs.shape={fsdp_logprobs.shape}"
-    )
-    # only compare results on data parallel head
     failed = False
-    if engine.is_data_parallel_head():
-        diff = torch.abs(logprobs - fsdp_logprobs)
+    if _MODEL_SKIP_FSDP_COMPARE.get(model_type, False):
+        # Models too large to co-locate a full FSDP replica (see
+        # _MODEL_SKIP_FSDP_COMPARE) skip the megatron-vs-FSDP cross-check. The
+        # megatron forward + cross-rank logprob consistency above are the
+        # validation here; bridge-conversion correctness is covered separately by
+        # the hf_save_load round-trip test.
         print(
-            f"rank {rank} diff between megatron and fsdp logprobs: {diff}, max(diff)={torch.max(diff)} avg(diff)={torch.mean(diff)}"
+            f"rank {rank} skipping megatron-vs-FSDP comparison for {model_type} "
+            "(too large to co-reside with an FSDP replica)."
         )
-
-        cosine_sim = torch.nn.functional.cosine_similarity(
-            logprobs.flatten().to(torch.float32),
-            fsdp_logprobs.flatten().to(torch.float32),
-            dim=0,
+        current_platform.synchronize()
+        dist.barrier()
+        engine.destroy()
+    else:
+        # make FSDP engine, and check the difference between FSDP and megatron engine
+        fsdp_engine = make_fsdp_engine(model_type, alloc_mode, mb_spec)
+        fsdp_logprobs = fsdp_engine.forward(
+            input_=input_,
+            aggregate_fn=lambda xs: torch.cat(xs, dim=0),
+        )
+        print(
+            f"rank {rank} logprobs.shape={logprobs.shape} fsdp_logprobs.shape={fsdp_logprobs.shape}"
         )
-        print(f"Cosine Similarity: {cosine_sim.item()}")
+        # only compare results on data parallel head
+        if engine.is_data_parallel_head():
+            diff = torch.abs(logprobs - fsdp_logprobs)
+            print(
+                f"rank {rank} diff between megatron and fsdp logprobs: {diff}, max(diff)={torch.max(diff)} avg(diff)={torch.mean(diff)}"
+            )
 
-        if cosine_sim < 0.99:
-            raise AssertionError(
-                f"Cosine similarity {cosine_sim.item()} is less than 0.99"
+            cosine_sim = torch.nn.functional.cosine_similarity(
+                logprobs.flatten().to(torch.float32),
+                fsdp_logprobs.flatten().to(torch.float32),
+                dim=0,
             )
+            print(f"Cosine Similarity: {cosine_sim.item()}")
 
-    current_platform.synchronize()
-    dist.barrier()
-    fsdp_engine.destroy()
-    engine.destroy()
+            if cosine_sim < 0.99:
+                raise AssertionError(
+                    f"Cosine similarity {cosine_sim.item()} is less than 0.99"
+                )
+
+        current_platform.synchronize()
+        dist.barrier()
+        fsdp_engine.destroy()
+        engine.destroy()
 
     print(f"Test: test_forward(model_type={model_type}, alloc_mode={alloc_mode}) Done.")
     if rank == 0 and output is not None:
@@ -522,30 +559,37 @@ def test_train_hf_save_load(
 
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_type])
 
+    skip_train = _MODEL_SAVELOAD_SKIP_TRAIN.get(model_type, False)
     mb_spec = MicroBatchSpec(max_tokens_per_mb=256)
     engine = make_engine(
-        model_type, alloc_mode, mb_spec, init_optimizer=True, vpp_size=vpp_size
+        model_type,
+        alloc_mode,
+        mb_spec,
+        init_optimizer=not skip_train,
+        vpp_size=vpp_size,
     )
 
     seeding.set_random_seed(0, key=f"trainer{rank}")
 
-    input_ = mock_input(batch_size=16, max_seqlen=128, device=engine.device)
-    bcasted_input = broadcast_tensor_container(
-        input_,
-        src_rank=engine.current_data_parallel_head(),
-        group=engine.context_and_model_parallel_group,
-    )
-
-    # train step — exercises forward + backward + optimizer with BSHD
-    train_result = engine.train_batch(
-        input_=bcasted_input,
-        loss_fn=mock_loss_fn,
-        loss_weight_fn=lambda x: x["cu_seqlens"][-1],
-    )
-    print(f"rank {rank} train_result: {train_result}")
-
-    current_platform.synchronize()
-    dist.barrier()
+    if not skip_train:
+        # train step — exercises forward + backward + optimizer with BSHD so the
+        # saved weights differ from the on-disk checkpoint. Skipped for models too
+        # large to hold an optimizer (see _MODEL_SAVELOAD_SKIP_TRAIN); the loaded
+        # HF weights are already non-trivial, so the round-trip stays meaningful.
+        input_ = mock_input(batch_size=16, max_seqlen=128, device=engine.device)
+        bcasted_input = broadcast_tensor_container(
+            input_,
+            src_rank=engine.current_data_parallel_head(),
+            group=engine.context_and_model_parallel_group,
+        )
+        train_result = engine.train_batch(
+            input_=bcasted_input,
+            loss_fn=mock_loss_fn,
+            loss_weight_fn=lambda x: x["cu_seqlens"][-1],
+        )
+        print(f"rank {rank} train_result: {train_result}")
+        current_platform.synchronize()
+        dist.barrier()
 
     # snapshot post-train weights
     with torch.no_grad():
@@ -609,7 +653,7 @@ def main():
     parser.add_argument(
         "--model_type",
         type=str,
-        choices=["qwen3", "qwen3moe", "qwen3_5"],
+        choices=["qwen3", "qwen3moe", "qwen3_5", "qwen3_5_moe"],
         default="qwen3",
         help="Type of model to test",
     )

From ff83b72b2ea8dac2d2d6d07589d4cfd217eb6a5a Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Tue, 2 Jun 2026 16:34:28 -0700
Subject: [PATCH 7/9] docs(example): add Qwen3.5-2B megatron geometry3k GRPO
 config

---
 .../qwen3_5_2b_megatron_geometry3k_grpo.yaml  | 186 ++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml

diff --git a/examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml b/examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml
new file mode 100644
index 000000000..9bd79cd64
--- /dev/null
+++ b/examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml
@@ -0,0 +1,186 @@
+experiment_name: geometry3k-grpo
+trial_name: trial1
+
+seed: 1
+enable_offload: false
+total_train_epochs: 15
+tokenizer_path: ${actor.path}
+
+cluster:
+  n_nodes: 1
+  n_gpus_per_node: 8
+  fileroot: /tmp/areal/experiments
+  name_resolve:
+    type: nfs
+    nfs_record_root: /tmp/areal/name_resolve
+
+scheduler:
+  type: null
+
+rollout:
+  backend: "vllm:d4p1t1"
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  max_concurrent_rollouts: 256
+  queue_size: null
+  consumer_batch_size: ${train_dataset.batch_size}
+  max_head_offpolicyness: 4
+  enable_rollout_tracing: false
+  scheduling_spec: ${actor.scheduling_spec}
+  fileroot: ${cluster.fileroot}
+  tokenizer_path: ${tokenizer_path}
+  dump_to_file: true
+
+gconfig:
+  n_samples: 4
+  min_new_tokens: 0
+  max_new_tokens: 2048
+  greedy: false
+  temperature: 1.0
+
+actor:
+  backend: "megatron:d2p1t2"
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  path: Qwen/Qwen3.5-2B
+  init_from_scratch: false
+  disable_dropout: true
+  gradient_checkpointing: true
+  dtype: bfloat16
+  mb_spec:
+    max_tokens_per_mb: 4096
+  optimizer:
+    type: adam
+    lr: 5e-7
+    weight_decay: 0.01
+    beta1: 0.9
+    beta2: 0.999
+    eps: 1e-8
+    lr_scheduler_type: constant
+    gradient_clipping: 1.0
+    warmup_steps_proportion: 0.001
+  eps_clip: 0.4
+  temperature: ${gconfig.temperature}
+  reward_scaling: 10.0
+  reward_bias: -0.5
+  kl_ctl: 0.0
+  ppo_n_minibatches: 1
+  recompute_logprob: true
+  use_decoupled_loss: true
+  rejection_sampling:
+    metric: ratio
+    upper: 5.0
+  reward_norm:
+    mean_level: group
+    std_level: group
+    group_size: ${gconfig.n_samples}
+  adv_norm:
+    mean_level: batch
+    std_level: batch
+  megatron:
+    bridge_type: megatron-bridge
+    use_padded_seq: true # required for GDN forward
+    use_bridge_for_update_weights: true # for live weight sync
+  max_new_tokens: ${gconfig.max_new_tokens}
+  scheduling_spec:
+    - task_type: worker
+      port_count: 2
+      gpu: 1
+      mem: 32
+      cmd: python3 -m areal.infra.rpc.rpc_server
+      env_vars: {}
+
+ref:
+  backend: ${actor.backend}
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  path: ${actor.path}
+  init_from_scratch: false
+  disable_dropout: true
+  dtype: ${actor.dtype}
+  mb_spec:
+    max_tokens_per_mb: 4096
+  optimizer: null
+  scheduling_strategy:
+    type: colocation
+    target: actor
+  scheduling_spec: ${actor.scheduling_spec}
+
+# SGLang
+sglang:
+  model_path: ${actor.path}
+  random_seed: ${seed}
+  skip_tokenizer_init: true
+  dtype: ${actor.dtype}
+  max_running_requests: 64
+  context_length: 32768
+  mem_fraction_static: 0.8
+  enable_multimodal: true
+
+vllm:
+  model: ${actor.path}
+  seed: ${seed}
+  skip_tokenizer_init: false
+  dtype: ${actor.dtype}
+  max_model_len: 32768
+  gpu_memory_utilization: 0.9
+  disable_sliding_window: false
+  enforce_eager: true
+  gdn_prefill_backend: triton
+
+# datasets
+train_dataset:
+  batch_size: 64
+  shuffle: true
+  pin_memory: true
+  num_workers: 4
+  path: hiyouga/geometry3k
+  type: rl
+
+valid_dataset:
+  batch_size: 64
+  pin_memory: true
+  num_workers: 4
+  path: hiyouga/geometry3k
+  type: rl
+
+# Utilities
+saver:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  freq_epochs: 1
+  freq_steps: null
+  freq_secs: null
+
+recover:
+  mode: disabled
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  freq_epochs: 1
+  freq_steps: null
+  freq_secs: 3600
+
+evaluator:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  freq_epochs: 1
+  freq_steps: null
+  freq_secs: null
+
+stats_logger:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  wandb:
+    mode: disabled
+
+perf_tracer:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  enabled: false
+  session_tracer:
+    enabled: false

From 1d91eec6b307a8d479115b91b45ad05108183423 Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Tue, 2 Jun 2026 17:59:37 -0700
Subject: [PATCH 8/9] docs(engine): clarify megatron-bridge patch docstring +
 document gdn_prefill_backend choices

---
 areal/api/cli_args.py                         | 12 ++++-
 .../megatron_utils/megatron_bridge_patches.py |  6 ++-
 docs/en/cli_reference.md                      | 48 +++++++++----------
 docs/zh/cli_reference.md                      | 48 +++++++++----------
 4 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/areal/api/cli_args.py b/areal/api/cli_args.py
index 9c5d49124..b3df4982b 100644
--- a/areal/api/cli_args.py
+++ b/areal/api/cli_args.py
@@ -1735,8 +1735,16 @@ class vLLMConfig:
     )
     enable_sleep_mode: bool = False
     uvicorn_log_level: str = "warning"
-    # GDN prefill backend for hybrid models like Qwen3.5; "triton" or "flashinfer".
-    gdn_prefill_backend: str | None = None
+    # GDN prefill backend for hybrid models like Qwen3.5; "triton" avoids the
+    # FlashInfer GDN-kernel hang (vLLM #38916). None leaves vLLM's default, so
+    # no flag is emitted and non-GDN models are unaffected.
+    gdn_prefill_backend: str | None = field(
+        default=None,
+        metadata={
+            "help": "GDN prefill backend for hybrid models like Qwen3.5.",
+            "choices": ["triton", "flashinfer"],
+        },
+    )
     # lora
     enable_lora: bool = False
     max_lora_rank: int = 16  # vllm's default
diff --git a/areal/engine/megatron_utils/megatron_bridge_patches.py b/areal/engine/megatron_utils/megatron_bridge_patches.py
index 9936429d9..1d3af5aa5 100644
--- a/areal/engine/megatron_utils/megatron_bridge_patches.py
+++ b/areal/engine/megatron_utils/megatron_bridge_patches.py
@@ -2,8 +2,10 @@
 
 """Runtime patches for megatron-bridge bugs not yet in a released version.
 
-Each patch is keyed to an upstream PR and is auto-disabled once megatron-bridge
-ships a release containing the fix. Apply patches at import time via
+Each patch is keyed to an upstream PR. Patches are not version-gated; instead
+each one's hot path becomes a no-op once the upstream fix is present (the patch
+checks for the missing attribute/behavior before acting), and an idempotency
+sentinel prevents double-application. Apply patches at import time via
 ``_apply_patches_on_import()`` at module bottom.
 """
 
diff --git a/docs/en/cli_reference.md b/docs/en/cli_reference.md
index b95b249e1..f45232452 100644
--- a/docs/en/cli_reference.md
+++ b/docs/en/cli_reference.md
@@ -634,30 +634,30 @@ Configuration for vLLM runtime. Refer to:
 
 https://docs.vllm.ai/en/stable/api/index.html for detailed documentation.
 
-| Parameter                      | Type                   | Default                                                             | Description |
-| ------------------------------ | ---------------------- | ------------------------------------------------------------------- | ----------- |
-| `model`                        | string                 | `""`                                                                | -           |
-| `seed`                         | integer                | `1`                                                                 | -           |
-| `skip_tokenizer_init`          | boolean                | `False`                                                             | -           |
-| `enforce_eager`                | boolean                | `False`                                                             | -           |
-| `dtype`                        | string                 | `"bfloat16"`                                                        | -           |
-| `distributed_executor_backend` | string                 | `"mp"`                                                              | -           |
-| `max_num_seqs`                 | integer                | `256`                                                               | -           |
-| `block_size`                   | integer                | `16`                                                                | -           |
-| `cpu_offload_gb`               | float                  | `0`                                                                 | -           |
-| `disable_sliding_window`       | boolean                | `True`                                                              | -           |
-| `max_model_len`                | integer \| None        | `32768`                                                             | -           |
-| `no_enable_chunked_prefill`    | boolean                | `False`                                                             | -           |
-| `no_enable_prefix_caching`     | boolean                | `True`                                                              | -           |
-| `gpu_memory_utilization`       | float                  | `0.9`                                                               | -           |
-| `worker_extension_cls`         | string                 | `"areal.engine.vllm_ext.vllm_worker_extension.VLLMWorkerExtension"` | -           |
-| `enable_sleep_mode`            | boolean                | `False`                                                             | -           |
-| `uvicorn_log_level`            | string                 | `"warning"`                                                         | -           |
-| `gdn_prefill_backend`          | string \| None         | `None`                                                              | -           |
-| `enable_lora`                  | boolean                | `False`                                                             | -           |
-| `max_lora_rank`                | integer                | `16`                                                                | -           |
-| `max_loras`                    | integer                | `8`                                                                 | -           |
-| `lora_modules`                 | list of string \| None | `None`                                                              | -           |
+| Parameter                      | Type                   | Default                                                             | Description                                                                             |
+| ------------------------------ | ---------------------- | ------------------------------------------------------------------- | --------------------------------------------------------------------------------------- |
+| `model`                        | string                 | `""`                                                                | -                                                                                       |
+| `seed`                         | integer                | `1`                                                                 | -                                                                                       |
+| `skip_tokenizer_init`          | boolean                | `False`                                                             | -                                                                                       |
+| `enforce_eager`                | boolean                | `False`                                                             | -                                                                                       |
+| `dtype`                        | string                 | `"bfloat16"`                                                        | -                                                                                       |
+| `distributed_executor_backend` | string                 | `"mp"`                                                              | -                                                                                       |
+| `max_num_seqs`                 | integer                | `256`                                                               | -                                                                                       |
+| `block_size`                   | integer                | `16`                                                                | -                                                                                       |
+| `cpu_offload_gb`               | float                  | `0`                                                                 | -                                                                                       |
+| `disable_sliding_window`       | boolean                | `True`                                                              | -                                                                                       |
+| `max_model_len`                | integer \| None        | `32768`                                                             | -                                                                                       |
+| `no_enable_chunked_prefill`    | boolean                | `False`                                                             | -                                                                                       |
+| `no_enable_prefix_caching`     | boolean                | `True`                                                              | -                                                                                       |
+| `gpu_memory_utilization`       | float                  | `0.9`                                                               | -                                                                                       |
+| `worker_extension_cls`         | string                 | `"areal.engine.vllm_ext.vllm_worker_extension.VLLMWorkerExtension"` | -                                                                                       |
+| `enable_sleep_mode`            | boolean                | `False`                                                             | -                                                                                       |
+| `uvicorn_log_level`            | string                 | `"warning"`                                                         | -                                                                                       |
+| `gdn_prefill_backend`          | string \| None         | `None`                                                              | GDN prefill backend for hybrid models like Qwen3.5. **Choices:** `triton`, `flashinfer` |
+| `enable_lora`                  | boolean                | `False`                                                             | -                                                                                       |
+| `max_lora_rank`                | integer                | `16`                                                                | -                                                                                       |
+| `max_loras`                    | integer                | `8`                                                                 | -                                                                                       |
+| `lora_modules`                 | list of string \| None | `None`                                                              | -                                                                                       |
 
 (section-train-dataset)=
 
diff --git a/docs/zh/cli_reference.md b/docs/zh/cli_reference.md
index 3b24e59b9..8742abe79 100644
--- a/docs/zh/cli_reference.md
+++ b/docs/zh/cli_reference.md
@@ -632,30 +632,30 @@ Configuration for vLLM runtime. Refer to:
 
 https://docs.vllm.ai/en/stable/api/index.html for detailed documentation.
 
-| Parameter                      | Type                   | Default                                                             | Description |
-| ------------------------------ | ---------------------- | ------------------------------------------------------------------- | ----------- |
-| `model`                        | string                 | `""`                                                                | -           |
-| `seed`                         | integer                | `1`                                                                 | -           |
-| `skip_tokenizer_init`          | boolean                | `False`                                                             | -           |
-| `enforce_eager`                | boolean                | `False`                                                             | -           |
-| `dtype`                        | string                 | `"bfloat16"`                                                        | -           |
-| `distributed_executor_backend` | string                 | `"mp"`                                                              | -           |
-| `max_num_seqs`                 | integer                | `256`                                                               | -           |
-| `block_size`                   | integer                | `16`                                                                | -           |
-| `cpu_offload_gb`               | float                  | `0`                                                                 | -           |
-| `disable_sliding_window`       | boolean                | `True`                                                              | -           |
-| `max_model_len`                | integer \| None        | `32768`                                                             | -           |
-| `no_enable_chunked_prefill`    | boolean                | `False`                                                             | -           |
-| `no_enable_prefix_caching`     | boolean                | `True`                                                              | -           |
-| `gpu_memory_utilization`       | float                  | `0.9`                                                               | -           |
-| `worker_extension_cls`         | string                 | `"areal.engine.vllm_ext.vllm_worker_extension.VLLMWorkerExtension"` | -           |
-| `enable_sleep_mode`            | boolean                | `False`                                                             | -           |
-| `uvicorn_log_level`            | string                 | `"warning"`                                                         | -           |
-| `gdn_prefill_backend`          | string \| None         | `None`                                                              | -           |
-| `enable_lora`                  | boolean                | `False`                                                             | -           |
-| `max_lora_rank`                | integer                | `16`                                                                | -           |
-| `max_loras`                    | integer                | `8`                                                                 | -           |
-| `lora_modules`                 | list of string \| None | `None`                                                              | -           |
+| Parameter                      | Type                   | Default                                                             | Description                                                                             |
+| ------------------------------ | ---------------------- | ------------------------------------------------------------------- | --------------------------------------------------------------------------------------- |
+| `model`                        | string                 | `""`                                                                | -                                                                                       |
+| `seed`                         | integer                | `1`                                                                 | -                                                                                       |
+| `skip_tokenizer_init`          | boolean                | `False`                                                             | -                                                                                       |
+| `enforce_eager`                | boolean                | `False`                                                             | -                                                                                       |
+| `dtype`                        | string                 | `"bfloat16"`                                                        | -                                                                                       |
+| `distributed_executor_backend` | string                 | `"mp"`                                                              | -                                                                                       |
+| `max_num_seqs`                 | integer                | `256`                                                               | -                                                                                       |
+| `block_size`                   | integer                | `16`                                                                | -                                                                                       |
+| `cpu_offload_gb`               | float                  | `0`                                                                 | -                                                                                       |
+| `disable_sliding_window`       | boolean                | `True`                                                              | -                                                                                       |
+| `max_model_len`                | integer \| None        | `32768`                                                             | -                                                                                       |
+| `no_enable_chunked_prefill`    | boolean                | `False`                                                             | -                                                                                       |
+| `no_enable_prefix_caching`     | boolean                | `True`                                                              | -                                                                                       |
+| `gpu_memory_utilization`       | float                  | `0.9`                                                               | -                                                                                       |
+| `worker_extension_cls`         | string                 | `"areal.engine.vllm_ext.vllm_worker_extension.VLLMWorkerExtension"` | -                                                                                       |
+| `enable_sleep_mode`            | boolean                | `False`                                                             | -                                                                                       |
+| `uvicorn_log_level`            | string                 | `"warning"`                                                         | -                                                                                       |
+| `gdn_prefill_backend`          | string \| None         | `None`                                                              | GDN prefill backend for hybrid models like Qwen3.5. **Choices:** `triton`, `flashinfer` |
+| `enable_lora`                  | boolean                | `False`                                                             | -                                                                                       |
+| `max_lora_rank`                | integer                | `16`                                                                | -                                                                                       |
+| `max_loras`                    | integer                | `8`                                                                 | -                                                                                       |
+| `lora_modules`                 | list of string \| None | `None`                                                              | -                                                                                       |
 
 (section-train-dataset)=
 

From a3f73e7730b2c22e0a085d1e4e6c7000ed5def64 Mon Sep 17 00:00:00 2001
From: Rongzhi Gu <51925155+Adiactive@users.noreply.github.com>
Date: Wed, 3 Jun 2026 11:29:47 -0700
Subject: [PATCH 9/9] refactor(engine): auto-derive padded-seq layout from
 model type

The padded (BSHD) vs packed (THD) forward layout is a hard architectural
property of the model -- GDN/SSM kernels (the Qwen3.5 family) reject
packed sequences -- not a user tunable. Exposing it as the
`use_padded_seq` config field let it be mis-set and risked silent
correctness or crash issues. Derive it from `model_type` instead so the
layout can never disagree with the architecture.

Also surface a startup warning when `use_bridge_for_update_weights=True`
but a fallback condition (non-megatron-bridge, FP8/quantized, or LoRA)
silently routes live weight sync through the registry path, so the
effective behavior is visible in logs.

Key changes:
- Add requires_padded_seq(model_type) helper in engine/core/model.py
- Derive self.use_padded_seq from model_type in MegatronEngine.initialize
- Remove use_padded_seq from MegatronEngineConfig and regenerate CLI docs
- Warn once when bridge weight-sync falls back to the registry path
- Drop the test-runner override map and example yaml flag

Refs: #1384
---
 areal/api/cli_args.py                         | 12 ----
 areal/engine/core/model.py                    | 10 +++
 areal/engine/megatron_engine.py               | 37 ++++++++---
 docs/en/cli_reference.md                      | 61 +++++++++----------
 docs/zh/cli_reference.md                      | 61 +++++++++----------
 .../qwen3_5_2b_megatron_geometry3k_grpo.yaml  |  1 -
 tests/test_megatron_engine_distributed.py     |  7 ++-
 .../run_megatron_engine_distributed.py        |  8 ---
 8 files changed, 103 insertions(+), 94 deletions(-)

diff --git a/areal/api/cli_args.py b/areal/api/cli_args.py
index b3df4982b..983bd933f 100644
--- a/areal/api/cli_args.py
+++ b/areal/api/cli_args.py
@@ -950,18 +950,6 @@ class MegatronEngineConfig:
         },
     )
 
-    use_padded_seq: bool = field(
-        default=False,
-        metadata={
-            "help": "Force padded (BSHD) input layout instead of packed (THD) for "
-            "forward / train_batch. Required for architectures whose state-space "
-            "or SSM layers reject packed sequences (e.g. Qwen3.5's GDN). Less "
-            "memory-efficient because attention computes over padding, so prefer "
-            "small per-microbatch sequence counts. Incompatible with "
-            "context_parallel_size > 1 (same constraint VLM has).",
-        },
-    )
-
 
 class SchedulingStrategyType(str, Enum):
     separation = "separation"
diff --git a/areal/engine/core/model.py b/areal/engine/core/model.py
index feecd6f81..3aab7ee4d 100644
--- a/areal/engine/core/model.py
+++ b/areal/engine/core/model.py
@@ -85,6 +85,16 @@ def is_qwen3_5_model(model_type: str) -> bool:
     return model_type in ["qwen3_5", "qwen3_5_text", "qwen3_5_moe", "qwen3_5_moe_text"]
 
 
+def requires_padded_seq(model_type: str) -> bool:
+    """Whether the model must run the padded (BSHD) forward instead of packed (THD).
+
+    GDN/SSM models (currently the Qwen3.5 family) reject packed sequences in their
+    attention/SSM kernels, so they must run on padded ``[B, S]`` input. THD stays
+    the default for every other model.
+    """
+    return is_qwen3_5_model(model_type)
+
+
 # Copied from trl
 def disable_dropout_in_model(model: torch.nn.Module) -> None:
     for module in model.modules():
diff --git a/areal/engine/megatron_engine.py b/areal/engine/megatron_engine.py
index aeddd1326..841bc2926 100644
--- a/areal/engine/megatron_engine.py
+++ b/areal/engine/megatron_engine.py
@@ -60,6 +60,7 @@
     disable_dropout_in_model,
     is_valid_vision_model,
     lang_config,
+    requires_padded_seq,
 )
 from areal.engine.megatron_utils import megatron_bridge_patches  # noqa: F401
 from areal.engine.megatron_utils.checkpointer import MegatronCheckpointManager
@@ -347,6 +348,10 @@ def initialize(self, addr: str | None, ft_spec: FinetuneSpec, *args, **kwargs):
             )
 
             self.is_vision_model = is_valid_vision_model(self.hf_config.model_type)
+            # GDN/SSM models (e.g. Qwen3.5) reject packed THD input and must run
+            # the padded BSHD forward. Derived from model type rather than a
+            # config flag so the layout can't be mis-set.
+            self.use_padded_seq = requires_padded_seq(self.hf_config.model_type)
             if self.is_vision_model:
                 if self.parallel_strategy.context_parallel_size > 1:
                     raise NotImplementedError(
@@ -362,14 +367,12 @@ def initialize(self, addr: str | None, ft_spec: FinetuneSpec, *args, **kwargs):
                     f"Loaded processor and tokenizer."
                 )
 
-            if (
-                self.mcore_config.use_padded_seq
-                and self.parallel_strategy.context_parallel_size > 1
-            ):
+            if self.use_padded_seq and self.parallel_strategy.context_parallel_size > 1:
                 raise NotImplementedError(
-                    "Context parallel (CP > 1) is not supported with "
-                    "use_padded_seq=True (the padded BSHD path operates on "
-                    "[B, S] tensors and the CP path packs sequences). "
+                    f"Context parallel (CP > 1) is not supported for "
+                    f"model_type={self.hf_config.model_type!r}, which requires the "
+                    "padded BSHD forward (it operates on [B, S] tensors while the "
+                    "CP path packs sequences). "
                     f"Got context_parallel_size={self.parallel_strategy.context_parallel_size}."
                 )
 
@@ -380,6 +383,24 @@ def initialize(self, addr: str | None, ft_spec: FinetuneSpec, *args, **kwargs):
             self._check_and_apply_fp8_config()
             self._validate_fp8_consistency()
 
+            # Warn once if bridge-delegated weight sync was requested but a
+            # fallback condition forces the registry conversion path (the
+            # dispatch in _update_weights_from_distributed silently falls back).
+            if self.mcore_config.use_bridge_for_update_weights:
+                fallback_reasons = []
+                if self.bridge_cls != "megatron-bridge":
+                    fallback_reasons.append(f"bridge_type={self.bridge_cls!r}")
+                if self.quantization_config:
+                    fallback_reasons.append("FP8/quantized training")
+                if self.config.use_lora:
+                    fallback_reasons.append("LoRA enabled")
+                if fallback_reasons:
+                    self.logger.warning(
+                        "use_bridge_for_update_weights=True, but live weight sync "
+                        "will use the registry conversion path instead because: "
+                        f"{', '.join(fallback_reasons)}."
+                    )
+
             with self.device:
                 models = make_mcore_model(
                     hf_config=self.hf_config,
@@ -856,7 +877,7 @@ def forward_step(batch_iter, model):
                 mb_input.padded_mb,
                 gather_cp_output=not cp_local,
                 is_vision_model=self.is_vision_model,
-                use_padded_seq=self.mcore_config.use_padded_seq,
+                use_padded_seq=self.use_padded_seq,
             )
 
             # Release tree attention metadata after forward pass
diff --git a/docs/en/cli_reference.md b/docs/en/cli_reference.md
index f45232452..2caed37ff 100644
--- a/docs/en/cli_reference.md
+++ b/docs/en/cli_reference.md
@@ -1064,37 +1064,36 @@ Configuration for Megatron-LM training framework.
 
 Refer to Megatron-LM documentation for implementation details.
 
-| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                                                                                                        |
-| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                                                                                                |
-| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                                                                                                  |
-| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints.                                                                               |
-| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                                                                                                            |
-| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                                                                                              |
-| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                                                                                                             |
-| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                                                                                              |
-| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                                                                                                  |
-| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                                                                                              |
-| `use_padded_seq`                           | boolean                                                              | `False`      | Force padded (BSHD) input layout instead of packed (THD) for forward / train_batch. Required for architectures whose state-space or SSM layers reject packed sequences (e.g. Qwen3.5's GDN). Less memory-efficient because attention computes over padding, so prefer small per-microbatch sequence counts. Incompatible with context_parallel_size > 1 (same constraint VLM has). |
+| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                          |
+| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
+| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                    |
+| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                  |
+| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                    |
+| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints. |
+| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
+| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                    |
+| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                    |
+| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                    |
+| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                              |
+| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                |
+| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                               |
+| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
+| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                |
+| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                    |
+| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                |
 
 (section-memory-profiler)=
 
diff --git a/docs/zh/cli_reference.md b/docs/zh/cli_reference.md
index 8742abe79..3405baae8 100644
--- a/docs/zh/cli_reference.md
+++ b/docs/zh/cli_reference.md
@@ -1062,37 +1062,36 @@ Configuration for Megatron-LM training framework.
 
 Refer to Megatron-LM documentation for implementation details.
 
-| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                                                                                                        |
-| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                                                                                                |
-| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                                                                                                  |
-| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints.                                                                               |
-| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                                                                                                            |
-| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                                                                                              |
-| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                                                                                                             |
-| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                                                                                                  |
-| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                                                                                              |
-| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                                                                                                  |
-| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                                                                                              |
-| `use_padded_seq`                           | boolean                                                              | `False`      | Force padded (BSHD) input layout instead of packed (THD) for forward / train_batch. Required for architectures whose state-space or SSM layers reject packed sequences (e.g. Qwen3.5's GDN). Less memory-efficient because attention computes over padding, so prefer small per-microbatch sequence counts. Incompatible with context_parallel_size > 1 (same constraint VLM has). |
+| Parameter                                  | Type                                                                 | Default      | Description                                                                                                                                                                                                                                                                                          |
+| ------------------------------------------ | -------------------------------------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `wrap_with_ddp`                            | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
+| `use_torch_fsdp2`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `use_custom_fsdp`                          | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `ddp`                                      | [`DistributedDataParallelConfig`](section-distributed-data-parallel) | **Required** | -                                                                                                                                                                                                                                                                                                    |
+| `virtual_pipeline_parallel_size`           | integer                                                              | `1`          | Virtual pipeline parallel size for Megatron interleaved schedule. Set to >1 to enable VPP. Default is 1 (disabled).                                                                                                                                                                                  |
+| `overlap_param_gather_with_optimizer_step` | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `use_precision_aware_optimizer`            | boolean                                                              | `False`      | Enable precision-aware optimizer for Megatron. When using adam_bf16 optimizer type with Megatron Engine, this is automatically enabled with exp_avg_dtype=bfloat16 and exp_avg_sq_dtype=bfloat16.                                                                                                    |
+| `main_grads_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `main_params_dtype`                        | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `exp_avg_dtype`                            | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `exp_avg_sq_dtype`                         | string                                                               | `"float32"`  | -                                                                                                                                                                                                                                                                                                    |
+| `async_save`                               | boolean                                                              | `False`      | If True, Megatron checkpoint saves run in background processes and save_checkpoint() returns immediately after weights are durably staged off the GPU. Pending saves are drained before the next load_checkpoint() and during engine.destroy(). Reduces per-save sync wait on large MoE checkpoints. |
+| `use_checkpoint_opt_param_scheduler`       | boolean                                                              | `True`       | -                                                                                                                                                                                                                                                                                                    |
+| `use_deterministic_algorithms`             | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_granularity`                    | string \| None                                                       | `"full"`     | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_method`                         | string \| None                                                       | `"uniform"`  | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_num_layers`                     | integer \| None                                                      | `1`          | -                                                                                                                                                                                                                                                                                                    |
+| `distribute_saved_activations`             | boolean \| None                                                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
+| `recompute_modules`                        | list of string \| None                                               | `None`       | -                                                                                                                                                                                                                                                                                                    |
+| `moe_router_dtype`                         | string \| None                                                       | `"fp32"`     | -                                                                                                                                                                                                                                                                                                    |
+| `moe_shared_expert_overlap`                | boolean                                                              | `False`      | Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared experts execute after the routed experts.                                                                                                                                              |
+| `moe_enable_deepep`                        | boolean                                                              | `False`      | -                                                                                                                                                                                                                                                                                                    |
+| `moe_token_dispatcher_type`                | string                                                               | `"alltoall"` | Type of token dispatcher. Options: 'allgather','alltoall' and 'flex'.                                                                                                                                                                                                                                |
+| `moe_permute_fusion`                       | boolean                                                              | `False`      | Fuse token rearrangement ops during token dispatching.                                                                                                                                                                                                                                               |
+| `fp8_config`                               | [`FP8EngineConfig`](section-fp8-engine) \| None                      | `None`       | -                                                                                                                                                                                                                                                                                                    |
+| `bridge_type`                              | string                                                               | `"mbridge"`  | Bridge backend for MegatronEngine. Choices: 'mbridge' or 'megatron-bridge'. **Choices:** `mbridge`, `megatron-bridge`                                                                                                                                                                                |
+| `use_mbridge_save`                         | boolean                                                              | `False`      | Use mbridge's save method to save gpu memory when saving weights.                                                                                                                                                                                                                                    |
+| `use_bridge_for_update_weights`            | boolean                                                              | `False`      | When True and bridge_type='megatron-bridge', delegate live weight sync to bridge.export_hf_weights instead of the hand-rolled convert_to_hf registry. Required for models without a registry entry (e.g. Qwen3.5). FP8 paths fall back to the registry automatically.                                |
 
 (section-memory-profiler)=
 
diff --git a/examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml b/examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml
index 9bd79cd64..6239ec956 100644
--- a/examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml
+++ b/examples/vlm/qwen3_5_2b_megatron_geometry3k_grpo.yaml
@@ -79,7 +79,6 @@ actor:
     std_level: batch
   megatron:
     bridge_type: megatron-bridge
-    use_padded_seq: true # required for GDN forward
     use_bridge_for_update_weights: true # for live weight sync
   max_new_tokens: ${gconfig.max_new_tokens}
   scheduling_spec:
diff --git a/tests/test_megatron_engine_distributed.py b/tests/test_megatron_engine_distributed.py
index c0ef5cebb..5f3766cc4 100644
--- a/tests/test_megatron_engine_distributed.py
+++ b/tests/test_megatron_engine_distributed.py
@@ -222,7 +222,7 @@ def test_qwen3_5_virtual_pipeline_parallel(tmp_path_factory):
 @pytest.mark.multi_gpu
 @pytest.mark.slow
 @pytest.mark.skip(
-    reason="BSHD mode (use_padded_seq) lacks microbatch invariance: padding "
+    reason="BSHD mode (padded forward) lacks microbatch invariance: padding "
     "changes per MB boundary cause small grad_norm drift. verl sidesteps "
     "this by setting ppo_micro_batch_size_per_gpu=1 (1 seq/MB, no padding "
     "diff). See run_qwen3_5_35b_megatron.sh for the recommended config."
@@ -268,8 +268,9 @@ def test_qwen3_5_hf_save_load(tmp_path_factory):
 
 # ──────────────────────────────────────────────────────────────────────
 # Qwen3.5 MoE tests. Same GDN hybrid attention as dense Qwen3.5 (routed through
-# bridge_type=megatron-bridge + use_padded_seq via the runner's override maps),
-# plus a Mixture-of-Experts FFN exercised with expert parallelism.
+# bridge_type=megatron-bridge via the runner's override map; the padded BSHD
+# forward is auto-derived from model_type), plus a Mixture-of-Experts FFN
+# exercised with expert parallelism.
 #
 # Parallelism constraints for this model:
 #   * Context parallel is unavailable for the Qwen3.5 series (GDN/SSM layers
diff --git a/tests/torchrun/run_megatron_engine_distributed.py b/tests/torchrun/run_megatron_engine_distributed.py
index 64943f50d..2384d50a9 100644
--- a/tests/torchrun/run_megatron_engine_distributed.py
+++ b/tests/torchrun/run_megatron_engine_distributed.py
@@ -42,12 +42,6 @@
     "qwen3_5_moe": "megatron-bridge",
 }
 
-# Models whose GDN/SSM kernels reject packed (THD) inputs must run forward
-# on padded [B, S] BSHD tensors. The engine reconstructs the 2D form
-# internally from cu_seqlens; no caller-side input change needed. Both the
-# dense and MoE qwen3_5 variants share the GDN attention layers.
-_MODEL_PADDED_SEQ_OVERRIDES = {"qwen3_5": True, "qwen3_5_moe": True}
-
 # Models large enough that a full-AdamW optimizer state does not fit even when
 # sharded (Qwen3.5-35B-A3B's optimizer state is ~420GB, exceeding 8x80GB with
 # params/grads/activations) skip the train step in the HF save/load round-trip.
@@ -106,7 +100,6 @@ def mock_input(
 
 def make_engine(model_type, backend, mb_spec, vpp_size=1, init_optimizer=False):
     bridge_type = _MODEL_BRIDGE_OVERRIDES.get(model_type, "mbridge")
-    use_padded_seq = _MODEL_PADDED_SEQ_OVERRIDES.get(model_type, False)
     config = TrainEngineConfig(
         backend=backend,
         experiment_name="test",
@@ -117,7 +110,6 @@ def make_engine(model_type, backend, mb_spec, vpp_size=1, init_optimizer=False):
         megatron=MegatronEngineConfig(
             virtual_pipeline_parallel_size=vpp_size,
             bridge_type=bridge_type,
-            use_padded_seq=use_padded_seq,
         ),
     )
     alloc_mode = ModelAllocation.from_str(backend)