huggingface · kemuxiaozi000 · Jun 17, 2026 · Jun 17, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -501,6 +501,8 @@
         title: Apertus
       - local: model_doc/arcee
         title: Arcee
+      - local: model_doc/bailing2_5_moe
+        title: BailingMoeV2_5
       - local: model_doc/bamba
         title: Bamba
       - local: model_doc/bart

diff --git a/docs/source/en/model_doc/bailing2_5_moe.md b/docs/source/en/model_doc/bailing2_5_moe.md
@@ -0,0 +1,72 @@
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was contributed to Hugging Face Transformers on 2026-06-23.*
+
+# BailingMoeV2_5
+
+## Overview
+
+The BailingMoeV2_5 model (Ling/Ring 2.6 series, e.g. Ling-2.6-flash) was proposed by [InclusionAI](https://huggingface.co/inclusionAI). It is based on a hybrid linear attention architecture, combining Multi-head Latent Attention (MLA), Lightning Linear Attention, and Mixture of Experts (MoE).
+
+Key architectural features:
+- **Hybrid Attention**: Uses a 1:7 ratio of MLA to Lightning Linear Attention layers, achieving near-linear computational complexity
+- **Multi-head Latent Attention (MLA)**: Similar to DeepSeek-V3, with compressed KV cache via LoRA projections
+- **Lightning Linear Attention**: Based on SimpleGLA (Simple Gated Linear Attention) from the flash-linear-attention library
+- **Mixture of Experts**: 256 routed experts with 8 active per token, plus shared experts
+
+### Usage tips
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+model = AutoModelForCausalLM.from_pretrained(
+    "inclusionAI/Ling-2.6-flash-base",
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ling-2.6-flash-base")
+
+inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=50)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+For optimal performance with the linear attention layers, install the [flash-linear-attention](https://github.com/fla-org/flash-linear-attention) library. Without it, the model falls back to a pure PyTorch implementation.
+
+## BailingMoeV2_5Config
+
+[[autodoc]] BailingMoeV2_5Config
+
+## BailingMoeV2_5Model
+
+[[autodoc]] BailingMoeV2_5Model
+    - forward
+
+## BailingMoeV2_5ForCausalLM
+
+[[autodoc]] BailingMoeV2_5ForCausalLM
+    - forward
+
+## BailingMoeV2_5ForSequenceClassification
+
+[[autodoc]] BailingMoeV2_5ForSequenceClassification
+    - forward
+
+## BailingMoeV2_5ForTokenClassification
+
+[[autodoc]] BailingMoeV2_5ForTokenClassification
+    - forward
diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
@@ -868,6 +868,31 @@ def _build_checkpoint_conversion_mapping():
             WeightRenaming(source_patterns=r"\.self_attn\.norm_q\.", target_patterns=".self_attn.q_norm."),
             WeightRenaming(source_patterns=r"\.self_attn\.norm_k\.", target_patterns=".self_attn.k_norm."),
         ],
+        "bailing2_5_moe": [
+            # Embedding rename.
+            WeightRenaming(r"word_embeddings", "embed_tokens"),
+            # NOTE: full-attention (MLA) layer indices (where (i + 1) % layer_group_size == 0)
+            # are injected dynamically in `extract_weight_conversions_for_model` based on the
+            # model config, so the mapping works for any num_hidden_layers / layer_group_size.
+            WeightRenaming(r"\.attention\.", ".linear_attn."),
+            WeightRenaming(r"\.dense\.weight", ".o_proj.weight"),
+            # MoE router bias rename.
+            WeightRenaming(r"mlp\.gate\.expert_bias", "mlp.gate.e_score_correction_bias"),
+            # Pack per-expert gate_proj and up_proj into a single 3D tensor.
+            WeightConverter(
+                source_patterns=[
+                    "mlp.experts.*.gate_proj.weight",
+                    "mlp.experts.*.up_proj.weight",
+                ],
+                target_patterns="mlp.experts.gate_up_proj",
+                operations=[MergeModulelist(dim=0), Concatenate(dim=1)],
+            ),
+            WeightConverter(
+                source_patterns="mlp.experts.*.down_proj.weight",
+                target_patterns="mlp.experts.down_proj",
+                operations=[MergeModulelist(dim=0)],
+            ),
+        ],
         "phimoe": [
             WeightRenaming(".block_sparse_moe.", ".mlp."),
             WeightRenaming(".gate.weight", ".router.weight"),
@@ -1516,6 +1541,20 @@ def extract_weight_conversions_for_model(
     conversions = get_checkpoint_conversion_mapping(class_name)
     if conversions is None and model_type:
         conversions = get_checkpoint_conversion_mapping(model_type)
+
+    if model_type == "bailing2_5_moe" and conversions is not None:
+        # Inject `attention -> self_attn` renames for full-attention layer indices,
+        # derived from the model config rather than hardcoded.
+        num_hidden_layers = getattr(model.config, "num_hidden_layers", 0)
+        layer_group_size = getattr(model.config, "layer_group_size", 0) or 0
+        if layer_group_size > 0:
+            full_attn_layers = [i for i in range(num_hidden_layers) if (i + 1) % layer_group_size == 0]
+            self_attn_renames = [
+                WeightRenaming(rf"layers\.{i}\.attention\.", f"layers.{i}.self_attn.") for i in full_attn_layers
+            ]
+            # These must run before the generic `.attention. -> .linear_attn.` rule.
+            conversions = self_attn_renames + conversions
+
     return conversions
 
 

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -31,6 +31,7 @@
     from .auto import *
     from .autoformer import *
     from .aya_vision import *
+    from .bailing2_5_moe import *
     from .bamba import *
     from .bark import *
     from .bart import *

diff --git a/src/transformers/models/auto/auto_mappings.py b/src/transformers/models/auto/auto_mappings.py
@@ -43,6 +43,7 @@
         ("audioflamingo3_encoder", "AudioFlamingo3EncoderConfig"),
         ("autoformer", "AutoformerConfig"),
         ("aya_vision", "AyaVisionConfig"),
+        ("bailing2_5_moe", "BailingMoeV2_5Config"),
         ("bamba", "BambaConfig"),
         ("bark", "BarkConfig"),
         ("bart", "BartConfig"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -56,6 +56,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("audioflamingo3_encoder", "AudioFlamingo3Encoder"),
         ("autoformer", "AutoformerModel"),
         ("aya_vision", "AyaVisionModel"),
+        ("bailing2_5_moe", "BailingMoeV2_5Model"),
         ("bamba", "BambaModel"),
         ("bark", "BarkModel"),
         ("bart", "BartModel"),
@@ -647,6 +648,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("apertus", "ApertusForCausalLM"),
         ("arcee", "ArceeForCausalLM"),
         ("aria_text", "AriaTextForCausalLM"),
+        ("bailing2_5_moe", "BailingMoeV2_5ForCausalLM"),
         ("bamba", "BambaForCausalLM"),
         ("bart", "BartForCausalLM"),
         ("bert", "BertLMHeadModel"),
@@ -1301,6 +1303,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         # Model for Sequence Classification mapping
         ("albert", "AlbertForSequenceClassification"),
         ("arcee", "ArceeForSequenceClassification"),
+        ("bailing2_5_moe", "BailingMoeV2_5ForSequenceClassification"),
         ("bart", "BartForSequenceClassification"),
         ("bert", "BertForSequenceClassification"),
         ("big_bird", "BigBirdForSequenceClassification"),
@@ -1533,6 +1536,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("albert", "AlbertForTokenClassification"),
         ("apertus", "ApertusForTokenClassification"),
         ("arcee", "ArceeForTokenClassification"),
+        ("bailing2_5_moe", "BailingMoeV2_5ForTokenClassification"),
         ("bert", "BertForTokenClassification"),
         ("big_bird", "BigBirdForTokenClassification"),
         ("biogpt", "BioGptForTokenClassification"),

diff --git a/src/transformers/models/bailing2_5_moe/__init__.py b/src/transformers/models/bailing2_5_moe/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# `TYPE_CHECKING` is False at runtime, so the star-imports below are only seen by static
+# analysers; a normal import of this package takes the `else` branch.
+if TYPE_CHECKING:
+    from .configuration_bailing2_5_moe import *
+    from .modeling_bailing2_5_moe import *
+else:
+    import sys
+
+    # Replace this package's entry in `sys.modules` with a `_LazyModule` built from the
+    # structure returned by `define_import_structure(_file)`, so submodules are presumably
+    # imported on first attribute access rather than eagerly (confirm against `_LazyModule`).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/bailing2_5_moe/configuration_bailing2_5_moe.py b/src/transformers/models/bailing2_5_moe/configuration_bailing2_5_moe.py
@@ -0,0 +1,162 @@
+# Copyright 2025 InclusionAI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BailingMoeV2_5 model configuration"""
+
+from huggingface_hub.dataclasses import strict
+
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import auto_docstring
+
+
+@auto_docstring(checkpoint="inclusionAI/Ling-2.6-flash-base")
+@strict
+class BailingMoeV2_5Config(PreTrainedConfig):
+    r"""
+    layer_group_size (`int`, *optional*, defaults to 8):
+        Controls the hybrid layer pattern. Every `layer_group_size`-th layer uses full MLA attention,
+        while the rest use lightning linear attention.
+    n_group (`int`, *optional*, defaults to 8):
+        Number of groups for routed experts in group-limited-greedy routing.
+    first_k_dense_replace (`int`, *optional*, defaults to 4):
+        Number of initial dense layers before switching to MoE.
+    rope_interleave (`bool`, *optional*, defaults to `True`):
+        Whether to interleave the rotary position embeddings.
+    group_norm_size (`int`, *optional*, defaults to 8):
+        Group size for group RMS normalization in linear attention layers.
+    num_kv_heads_for_linear_attn (`int`, *optional*, defaults to 64):
+        Number of key-value heads used in linear attention layers.
+    linear_silu (`bool`, *optional*, defaults to `False`):
+        Whether to apply SiLU activation on the gate in linear attention.
+    moe_shared_expert_intermediate_size (`int`, *optional*, defaults to 2048):
+        Intermediate size of the shared expert in MoE layers.
+    topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
+        Method for selecting top-k experts in the MoE layer.
+    scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
+        Scoring function for the router in the MoE layer.
+    partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+        Fraction of the head dimension to apply rotary position embeddings in linear attention layers.
+    router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+        Coefficient for the auxiliary load balancing loss from the router.
+
+    Example:
+
+    ```python
+    >>> from transformers import BailingMoeV2_5Model, BailingMoeV2_5Config
+
+    >>> # Initializing a BailingMoeV2_5 style configuration
+    >>> configuration = BailingMoeV2_5Config()
+
+    >>> # Initializing a model from the configuration
+    >>> model = BailingMoeV2_5Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "bailing2_5_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Sharding style per module-name pattern. Judging by the attribute name this is the
+    # tensor-parallel plan consumed by the base class -- confirm against `PreTrainedConfig`.
+    base_model_tp_plan = {
+        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
+        "layers.*.mlp.experts.down_proj": "rowwise",
+        "layers.*.mlp.experts": "moe_tp_experts",
+        "layers.*.mlp.shared_experts.gate_proj": "colwise",
+        "layers.*.mlp.shared_experts.up_proj": "colwise",
+        "layers.*.mlp.shared_experts.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    # Maps each top-level submodule to its (input names, output names); presumably the
+    # pipeline-parallel plan, going by the attribute name -- confirm against `PreTrainedConfig`.
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    # NOTE(review): presumably makes `num_local_experts` an alias of `num_experts`;
+    # the aliasing itself is implemented by the base class, not shown here.
+    attribute_map = {
+        "num_local_experts": "num_experts",
+    }
+
+    vocab_size: int = 157184
+    hidden_size: int = 8192
+    intermediate_size: int = 18432
+    moe_intermediate_size: int = 2048
+    moe_shared_expert_intermediate_size: int = 2048
+    num_hidden_layers: int = 80
+    num_attention_heads: int = 64
+    # `None` is replaced by `num_attention_heads` in `__post_init__`.
+    num_key_value_heads: int | None = 64
+    num_experts: int = 256
+    num_shared_experts: int = 1
+    num_experts_per_tok: int | None = 8
+    routed_scaling_factor: float = 2.5
+    kv_lora_rank: int = 512
+    q_lora_rank: int | None = 1536
+    qk_rope_head_dim: int = 64
+    v_head_dim: int | None = 128
+    qk_nope_head_dim: int = 128
+    n_group: int | None = 8
+    topk_group: int | None = 4
+    topk_method: str = "noaux_tc"
+    scoring_func: str = "sigmoid"
+    first_k_dense_replace: int | None = 4
+    norm_topk_prob: bool | None = True
+    layer_group_size: int = 8
+    group_norm_size: int = 8
+    num_kv_heads_for_linear_attn: int = 64
+    linear_silu: bool = False
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 131072
+    initializer_range: float = 0.02
+    rms_norm_eps: float = 1e-6
+    use_cache: bool = True
+    pad_token_id: int | None = 156892
+    bos_token_id: int | None = None
+    eos_token_id: int | list[int] | None = 156892
+    tie_word_embeddings: bool = False
+    rope_parameters: RopeParameters | dict | None = None
+    rope_interleave: bool | None = True
+    partial_rotary_factor: float = 0.5
+    attention_bias: bool = False
+    attention_dropout: float | int | None = 0.0
+    use_qk_norm: bool = True
+    output_router_logits: bool = False
+    router_aux_loss_coef: float = 0.001
+    # `None` means "derive from `layer_group_size`"; see `__post_init__`.
+    layer_types: list[str] | None = None
+
+    def __post_init__(self, **kwargs):
+        """Fill in derived attributes, then delegate to the parent `__post_init__`.
+
+        Sets `num_key_value_heads` (when `None`), `qk_head_dim`, `head_dim` and, when not
+        given explicitly, `layer_types`. `kwargs` are forwarded unchanged to the parent.
+        """
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+
+        # Full query/key head width is the non-rotary part plus the rotary part.
+        self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
+        # `head_dim` is deliberately the rotary width only, not `qk_head_dim`.
+        self.head_dim = self.qk_rope_head_dim
+
+        if self.layer_types is None:
+            # Counting layers from 1, every `layer_group_size`-th layer is "full_attention"
+            # and all others are "linear_attention".
+            # NOTE(review): `layer_group_size == 0` raises ZeroDivisionError here.
+            self.layer_types = [
+                "full_attention" if (i + 1) % self.layer_group_size == 0 else "linear_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+
+        super().__post_init__(**kwargs)
+
+    def convert_rope_params_to_dict(self, **kwargs):
+        """Normalise RoPE settings into `self.rope_parameters` and return the leftover kwargs.
+
+        A truthy `rope_scaling` entry in `kwargs` takes precedence over the existing
+        `rope_parameters`; both `rope_scaling` and `rope_theta` are removed from `kwargs`.
+        """
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        self.rope_parameters = rope_scaling or self.rope_parameters
+        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {}
+
+        # The `pop` runs even when "rope_theta" is already set, so the kwarg is always consumed.
+        # `default_theta` is not defined in this class; presumably inherited -- confirm.
+        self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta))
+        self.standardize_rope_params()
+
+        # Coerce these entries to float when present (e.g. if they were given as ints).
+        for key in ["beta_fast", "beta_slow", "factor"]:
+            if key in self.rope_parameters:
+                self.rope_parameters[key] = float(self.rope_parameters[key])
+        return kwargs
+
+
+__all__ = ["BailingMoeV2_5Config"]