Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,8 @@
title: Apertus
- local: model_doc/arcee
title: Arcee
- local: model_doc/bailing2_5_moe
title: BailingMoeV2_5
- local: model_doc/bamba
title: Bamba
- local: model_doc/bart
Expand Down
72 changes: 72 additions & 0 deletions docs/source/en/model_doc/bailing2_5_moe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
<!--Copyright 2026 The HuggingFace Team. All rights reserved.

for others as well please


Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was contributed to Hugging Face Transformers on 2026-06-23.*

# BailingMoeV2_5

## Overview

The BailingMoeV2_5 model (Ling/Ring 2.6 series, e.g. Ling-2.6-flash) was proposed by [InclusionAI](https://huggingface.co/inclusionAI). It is based on a hybrid linear attention architecture, combining Multi-head Latent Attention (MLA), Lightning Linear Attention, and Mixture of Experts (MoE).

Key architectural features:
- **Hybrid Attention**: Uses a 1:7 ratio of MLA to Lightning Linear Attention layers, achieving near-linear computational complexity
- **Multi-head Latent Attention (MLA)**: Similar to DeepSeek-V3, with compressed KV cache via LoRA projections
- **Lightning Linear Attention**: Based on SimpleGLA (Simple Gated Linear Attention) from the flash-linear-attention library
- **Mixture of Experts**: 256 routed experts with 8 active per token, plus shared experts

### Usage tips

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained(
"inclusionAI/Ling-2.6-flash-base",
device_map="auto",
dtype=torch.bfloat16,

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
dtype=torch.bfloat16,

Shouldn't be needed; we use `auto` as the default.

)
tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ling-2.6-flash-base")

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

For optimal performance with the linear attention layers, install the [flash-linear-attention](https://github.com/fla-org/flash-linear-attention) library. Without it, the model falls back to a pure PyTorch implementation.

## BailingMoeV2_5Config

[[autodoc]] BailingMoeV2_5Config

## BailingMoeV2_5Model

[[autodoc]] BailingMoeV2_5Model
- forward

## BailingMoeV2_5ForCausalLM

[[autodoc]] BailingMoeV2_5ForCausalLM
- forward

## BailingMoeV2_5ForSequenceClassification

[[autodoc]] BailingMoeV2_5ForSequenceClassification
- forward

## BailingMoeV2_5ForTokenClassification

[[autodoc]] BailingMoeV2_5ForTokenClassification
- forward
39 changes: 39 additions & 0 deletions src/transformers/conversion_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,31 @@ def _build_checkpoint_conversion_mapping():
WeightRenaming(source_patterns=r"\.self_attn\.norm_q\.", target_patterns=".self_attn.q_norm."),
WeightRenaming(source_patterns=r"\.self_attn\.norm_k\.", target_patterns=".self_attn.k_norm."),
],
"bailing2_5_moe": [
# Embedding rename.
WeightRenaming(r"word_embeddings", "embed_tokens"),
# NOTE: full-attention (MLA) layer indices (where (i + 1) % layer_group_size == 0)
# are injected dynamically in `extract_weight_conversions_for_model` based on the
# model config, so the mapping works for any num_hidden_layers / layer_group_size.
WeightRenaming(r"\.attention\.", ".linear_attn."),
WeightRenaming(r"\.dense\.weight", ".o_proj.weight"),
# MoE router bias rename.
WeightRenaming(r"mlp\.gate\.expert_bias", "mlp.gate.e_score_correction_bias"),
# Pack per-expert gate_proj and up_proj into a single 3D tensor.
WeightConverter(
source_patterns=[
"mlp.experts.*.gate_proj.weight",
"mlp.experts.*.up_proj.weight",
],
target_patterns="mlp.experts.gate_up_proj",
operations=[MergeModulelist(dim=0), Concatenate(dim=1)],
),
WeightConverter(
source_patterns="mlp.experts.*.down_proj.weight",
target_patterns="mlp.experts.down_proj",
operations=[MergeModulelist(dim=0)],
),
],
"phimoe": [
WeightRenaming(".block_sparse_moe.", ".mlp."),
WeightRenaming(".gate.weight", ".router.weight"),
Expand Down Expand Up @@ -1516,6 +1541,20 @@ def extract_weight_conversions_for_model(
conversions = get_checkpoint_conversion_mapping(class_name)
if conversions is None and model_type:
conversions = get_checkpoint_conversion_mapping(model_type)

if model_type == "bailing2_5_moe" and conversions is not None:
# Inject `attention -> self_attn` renames for full-attention layer indices,
# derived from the model config rather than hardcoded.
num_hidden_layers = getattr(model.config, "num_hidden_layers", 0)
layer_group_size = getattr(model.config, "layer_group_size", 0) or 0
if layer_group_size > 0:
full_attn_layers = [i for i in range(num_hidden_layers) if (i + 1) % layer_group_size == 0]
self_attn_renames = [
WeightRenaming(rf"layers\.{i}\.attention\.", f"layers.{i}.self_attn.") for i in full_attn_layers

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, imo I also don't mind to have the same naming internally. this is very awkward so would like to avoid this

]
# These must run before the generic `.attention. -> .linear_attn.` rule.
conversions = self_attn_renames + conversions

return conversions


Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from .auto import *
from .autoformer import *
from .aya_vision import *
from .bailing2_5_moe import *
from .bamba import *
from .bark import *
from .bart import *
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/auto_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
("audioflamingo3_encoder", "AudioFlamingo3EncoderConfig"),
("autoformer", "AutoformerConfig"),
("aya_vision", "AyaVisionConfig"),
("bailing2_5_moe", "BailingMoeV2_5Config"),
("bamba", "BambaConfig"),
("bark", "BarkConfig"),
("bart", "BartConfig"),
Expand Down
4 changes: 4 additions & 0 deletions src/transformers/models/auto/modeling_auto.py

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be sure: The tokenizers backend is used for this model so we don't need an entry to tokenization auto?

Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("audioflamingo3_encoder", "AudioFlamingo3Encoder"),
("autoformer", "AutoformerModel"),
("aya_vision", "AyaVisionModel"),
("bailing2_5_moe", "BailingMoeV2_5Model"),
("bamba", "BambaModel"),
("bark", "BarkModel"),
("bart", "BartModel"),
Expand Down Expand Up @@ -647,6 +648,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("apertus", "ApertusForCausalLM"),
("arcee", "ArceeForCausalLM"),
("aria_text", "AriaTextForCausalLM"),
("bailing2_5_moe", "BailingMoeV2_5ForCausalLM"),
("bamba", "BambaForCausalLM"),
("bart", "BartForCausalLM"),
("bert", "BertLMHeadModel"),
Expand Down Expand Up @@ -1301,6 +1303,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
# Model for Sequence Classification mapping
("albert", "AlbertForSequenceClassification"),
("arcee", "ArceeForSequenceClassification"),
("bailing2_5_moe", "BailingMoeV2_5ForSequenceClassification"),
("bart", "BartForSequenceClassification"),
("bert", "BertForSequenceClassification"),
("big_bird", "BigBirdForSequenceClassification"),
Expand Down Expand Up @@ -1533,6 +1536,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("albert", "AlbertForTokenClassification"),
("apertus", "ApertusForTokenClassification"),
("arcee", "ArceeForTokenClassification"),
("bailing2_5_moe", "BailingMoeV2_5ForTokenClassification"),
("bert", "BertForTokenClassification"),
("big_bird", "BigBirdForTokenClassification"),
("biogpt", "BioGptForTokenClassification"),
Expand Down
27 changes: 27 additions & 0 deletions src/transformers/models/bailing2_5_moe/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
from .configuration_bailing2_5_moe import *
from .modeling_bailing2_5_moe import *
else:
import sys

_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
162 changes: 162 additions & 0 deletions src/transformers/models/bailing2_5_moe/configuration_bailing2_5_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Copyright 2025 InclusionAI and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BailingMoeV2_5 model configuration"""

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring


@auto_docstring(checkpoint="inclusionAI/Ling-2.6-flash-base")
@strict
class BailingMoeV2_5Config(PreTrainedConfig):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imo we can move this to modular and inherit from something like DeepSeek V2/V3?

r"""
layer_group_size (`int`, *optional*, defaults to 8):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should be set via layer types instead, we could handle this in the post init

Controls the hybrid layer pattern. Every `layer_group_size`-th layer uses full MLA attention,
while the rest use lightning linear attention.
n_group (`int`, *optional*, defaults to 8):
Number of groups for routed experts in group-limited-greedy routing.
first_k_dense_replace (`int`, *optional*, defaults to 4):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mlp layer types

Number of initial dense layers before switching to MoE.
rope_interleave (`bool`, *optional*, defaults to `True`):
Whether to interleave the rotary position embeddings.
group_norm_size (`int`, *optional*, defaults to 8):
Group size for group RMS normalization in linear attention layers.
num_kv_heads_for_linear_attn (`int`, *optional*, defaults to 64):
Number of key-value heads used in linear attention layers.
Comment on lines +38 to +39

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: rename to linear_key_value_heads

linear_silu (`bool`, *optional*, defaults to `False`):
Whether to apply SiLU activation on the gate in linear attention.
moe_shared_expert_intermediate_size (`int`, *optional*, defaults to 2048):
Intermediate size of the shared expert in MoE layers.
topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
Method for selecting top-k experts in the MoE layer.
scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
Scoring function for the router in the MoE layer.
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
Fraction of the head dimension to apply rotary position embeddings in linear attention layers.
Comment on lines +48 to +49

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should be in the rope parameters not within a "normal" attr

router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
Coefficient for the auxiliary load balancing loss from the router.

Example:

```python
>>> from transformers import BailingMoeV2_5Model, BailingMoeV2_5Config

>>> # Initializing a BailingMoeV2_5 style configuration
>>> configuration = BailingMoeV2_5Config()

>>> # Initializing a model from the configuration
>>> model = BailingMoeV2_5Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

model_type = "bailing2_5_moe"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we could also include the MLA here no?

"layers.*.mlp.experts.gate_up_proj": "packed_colwise",
"layers.*.mlp.experts.down_proj": "rowwise",
"layers.*.mlp.experts": "moe_tp_experts",
"layers.*.mlp.shared_experts.gate_proj": "colwise",
"layers.*.mlp.shared_experts.up_proj": "colwise",
"layers.*.mlp.shared_experts.down_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
attribute_map = {
"num_local_experts": "num_experts",
}

vocab_size: int = 157184
hidden_size: int = 8192
intermediate_size: int = 18432
moe_intermediate_size: int = 2048
moe_shared_expert_intermediate_size: int = 2048
num_hidden_layers: int = 80
num_attention_heads: int = 64
num_key_value_heads: int | None = 64
num_experts: int = 256
num_shared_experts: int = 1
num_experts_per_tok: int | None = 8
routed_scaling_factor: float = 2.5
kv_lora_rank: int = 512
q_lora_rank: int | None = 1536
qk_rope_head_dim: int = 64
v_head_dim: int | None = 128
qk_nope_head_dim: int = 128
n_group: int | None = 8
topk_group: int | None = 4
topk_method: str = "noaux_tc"
scoring_func: str = "sigmoid"
first_k_dense_replace: int | None = 4
norm_topk_prob: bool | None = True
layer_group_size: int = 8
group_norm_size: int = 8
num_kv_heads_for_linear_attn: int = 64
linear_silu: bool = False
hidden_act: str = "silu"
max_position_embeddings: int = 131072
initializer_range: float = 0.02
rms_norm_eps: float = 1e-6
use_cache: bool = True
pad_token_id: int | None = 156892
bos_token_id: int | None = None
eos_token_id: int | list[int] | None = 156892
tie_word_embeddings: bool = False
rope_parameters: RopeParameters | dict | None = None
rope_interleave: bool | None = True
partial_rotary_factor: float = 0.5
attention_bias: bool = False
attention_dropout: float | int | None = 0.0
use_qk_norm: bool = True
output_router_logits: bool = False
router_aux_loss_coef: float = 0.001
layer_types: list[str] | None = None

def __post_init__(self, **kwargs):
    """Fill in derived attributes and the default per-layer attention pattern.

    - `num_key_value_heads` falls back to `num_attention_heads` when it is `None`.
    - `qk_head_dim` is set to `qk_nope_head_dim + qk_rope_head_dim`.
    - `head_dim` is set to `qk_rope_head_dim`.
    - When `layer_types` is not supplied, it is built from `layer_group_size`: layer `i` is
      `"full_attention"` if `(i + 1)` is a multiple of `layer_group_size`, otherwise
      `"linear_attention"`.

    Raises:
        ValueError: If `layer_types` is not supplied and `layer_group_size` is `0` or `None`,
            since the hybrid pattern cannot be derived in that case.
    """
    if self.num_key_value_heads is None:
        self.num_key_value_heads = self.num_attention_heads

    self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
    self.head_dim = self.qk_rope_head_dim

    if self.layer_types is None:
        # A zero/None group size would otherwise surface as a bare ZeroDivisionError/TypeError
        # from the modulo below; fail with an actionable message instead. An explicit
        # `layer_types` bypasses this check because `layer_group_size` is then unused here.
        if not self.layer_group_size:
            raise ValueError(
                "`layer_group_size` must be a non-zero integer when `layer_types` is not provided, "
                f"got {self.layer_group_size!r}. Either set `layer_group_size` or pass `layer_types` explicitly."
            )
        self.layer_types = [
            "full_attention" if (i + 1) % self.layer_group_size == 0 else "linear_attention"
            for i in range(self.num_hidden_layers)
        ]

    super().__post_init__(**kwargs)

def convert_rope_params_to_dict(self, **kwargs):
    """Populate `self.rope_parameters` from the config and legacy keyword arguments.

    A `rope_scaling` entry in `kwargs` (if truthy) takes precedence over the existing
    `rope_parameters`; when neither is available an empty dict is used. `rope_theta` is taken
    from `kwargs` (or `self.default_theta`) only if the dict does not already define it. After
    `self.standardize_rope_params()` runs, `beta_fast`, `beta_slow` and `factor` are cast to
    `float` when present.

    Returns:
        The remaining `kwargs`, with `rope_scaling` and `rope_theta` removed.
    """
    legacy_scaling = kwargs.pop("rope_scaling", None)
    params = legacy_scaling or self.rope_parameters
    if params is None:
        params = {}
    self.rope_parameters = params

    # `rope_theta` is always consumed from kwargs, even when the dict already carries one.
    theta = kwargs.pop("rope_theta", self.default_theta)
    self.rope_parameters.setdefault("rope_theta", theta)
    self.standardize_rope_params()

    # Re-read the attribute on each access: standardization may have replaced the dict.
    for name in ("beta_fast", "beta_slow", "factor"):
        if name not in self.rope_parameters:
            continue
        self.rope_parameters[name] = float(self.rope_parameters[name])
    return kwargs


__all__ = ["BailingMoeV2_5Config"]
Loading