huggingface · Rocketknight1 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -601,6 +601,10 @@
         title: Ernie4_5_MoE
       - local: model_doc/esm
         title: ESM
+      - local: model_doc/esmc
+        title: ESMC
+      - local: model_doc/esmfold2
+        title: ESMFold2
       - local: model_doc/eurobert
         title: EuroBERT
       - local: model_doc/exaone4

diff --git a/docs/source/en/model_doc/esmc.md b/docs/source/en/model_doc/esmc.md
@@ -0,0 +1,102 @@
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was contributed to Hugging Face Transformers on 2026-07-02.*
+
+# ESMC
+
+## Overview
+
+ESMC (ESM Cambrian) is a family of protein language models released by [Biohub](https://biohub.org/).
+It is a bidirectional Transformer encoder trained with a masked-language-modelling objective over amino-acid sequences.
+Like [ESM-2](./esm), ESMC produces per-residue representations that are useful for downstream protein modelling tasks.
+
+ESMC is suitable for fine-tuning on protein classification or token classification tasks. It is also used as the
+backbone of [ESMFold2](./esmfold2), where it generates representations that are used as input to the folding head.
+
+Pre-trained checkpoints are available on the Hugging Face Hub:
+
+- [`biohub/ESMC-300M`](https://huggingface.co/biohub/ESMC-300M)
+- [`biohub/ESMC-600M`](https://huggingface.co/biohub/ESMC-600M)
+- [`biohub/ESMC-6B`](https://huggingface.co/biohub/ESMC-6B)
+
+## Usage example
+
+ESMC is registered with the auto classes (`AutoModel`, `AutoModelForMaskedLM`,
+`AutoModelForSequenceClassification`, `AutoModelForTokenClassification`).
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```python
+import torch
+from transformers import pipeline
+
+extractor = pipeline(
+    task="feature-extraction",
+    model="biohub/ESMC-300M",
+)
+# Per-residue representations of shape (batch, sequence_length, hidden_size).
+representations = extractor("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", return_tensors=True)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("biohub/ESMC-300M")
+model = AutoModel.from_pretrained("biohub/ESMC-300M")
+
+inputs = tokenizer("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Per-residue representations of shape (batch, sequence_length, hidden_size).
+representations = outputs.last_hidden_state
+```
+
+</hfoption>
+</hfoptions>
+
+## ESMCConfig
+
+[[autodoc]] ESMCConfig
+
+## ESMCTokenizer
+
+[[autodoc]] ESMCTokenizer
+
+## ESMCModel
+
+[[autodoc]] ESMCModel
+    - forward
+
+## ESMCForMaskedLM
+
+[[autodoc]] ESMCForMaskedLM
+    - forward
+
+## ESMCForSequenceClassification
+
+[[autodoc]] ESMCForSequenceClassification
+    - forward
+
+## ESMCForTokenClassification
+
+[[autodoc]] ESMCForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/esmfold2.md b/docs/source/en/model_doc/esmfold2.md
@@ -0,0 +1,84 @@
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was contributed to Hugging Face Transformers on 2026-07-02.*
+
+# ESMFold2
+
+## Overview
+
+ESMFold2 is an all-atom protein structure prediction model. It predicts 3D coordinates and per-residue confidence
+(pLDDT, PAE, PDE) directly from an amino-acid sequence, using the [ESMC](./esmc) protein language model as its
+backbone. The architecture combines a sliding-window atom encoder with 3D rotary position embeddings, a pairwise
+folding trunk applied iteratively, a diffusion-based structure head, and a confidence head.
+
+The model checkpoints are available on the Hugging Face Hub at
+[`biohub/ESMFold2`](https://huggingface.co/biohub/ESMFold2) and [`biohub/ESMFold2-Fast`](https://huggingface.co/biohub/ESMFold2-Fast).
+
+## Usage example
+
+```python
+import torch
+
+from transformers import ESMFold2Model
+
+# The ESMC backbone is bundled in the checkpoint and loaded with the model.
+# bf16 is the recommended inference precision.
+model = ESMFold2Model.from_pretrained("biohub/ESMFold2", dtype=torch.bfloat16).cuda().eval()
+
+pdb_string = model.infer_protein_as_pdb("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")
+with open("prediction.pdb", "w") as f:
+    f.write(pdb_string)
+```
+
+`infer_protein` returns the raw outputs (atom coordinates, distogram logits and confidence metrics) as an
+[`~models.esmfold2.modeling_esmfold2.ESMFold2Output`] if you need them instead of a PDB string. You may get
+slightly different predictions if you run the same sequence multiple times. Set a manual seed if you want exactly
+reproducible structures.
+
+## Faster inference with a fused kernel
+
+The folding trunk's dominant cost is the triangle-multiplication update. Passing `use_kernels=True` to
+[`~PreTrainedModel.from_pretrained`] swaps it for a fused Triton kernel loaded from the Hub via the
+[`kernels`](https://github.com/huggingface/kernels) library, leaving the prediction unchanged. It is inference-only and
+CUDA-only; on CPU or without the kernel installed the model transparently falls back to the pure-PyTorch implementation.
+Make sure the model is on a CUDA device when kernelization happens (e.g. with `device_map`).
+
+```python
+import torch
+
+from transformers import ESMFold2Model
+
+model = ESMFold2Model.from_pretrained(
+    "biohub/ESMFold2", dtype=torch.bfloat16, device_map="cuda", use_kernels=True
+).eval()
+
+pdb_string = model.infer_protein_as_pdb("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")
+```
+
+## ESMFold2Config
+
+[[autodoc]] ESMFold2Config
+
+## ESMFold2PreTrainedModel
+
+[[autodoc]] ESMFold2PreTrainedModel
+
+## ESMFold2Model
+
+[[autodoc]] ESMFold2Model
+    - forward
+    - infer_protein
+    - infer_protein_as_pdb
diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
@@ -790,6 +790,73 @@ def _build_checkpoint_conversion_mapping():
                 "rotary_embeddings.inv_freq",
             ),
         ],
+        "esmc": [
+            WeightRenaming(r"embed\.", "embed_tokens."),
+            WeightRenaming(r"transformer\.blocks", "layers"),
+            # The negative lookbehinds anchor the *reverse* search to the final encoder
+            # norm only (they are stripped from the forward replacement), so saving does
+            # not rewrite the "norm" inside ``input_layernorm`` / ``post_attention_layernorm``.
+            WeightRenaming(r"transformer\.norm\.", r"(?<!layer)(?<!_)norm\."),
+            WeightRenaming(r"attn\.layernorm_qkv\.layer_norm_weight", "input_layernorm.weight"),
+            WeightRenaming(r"attn\.layernorm_qkv\.layer_norm_bias", "input_layernorm.bias"),
+            WeightRenaming(r"attn\.q_ln", "self_attn.q_norm"),
+            WeightRenaming(r"attn\.k_ln", "self_attn.k_norm"),
+            WeightRenaming(r"attn\.out_proj", "self_attn.o_proj"),
+            WeightRenaming(r"ffn\.layer_norm_weight", "post_attention_layernorm.weight"),
+            WeightRenaming(r"ffn\.layer_norm_bias", "post_attention_layernorm.bias"),
+            WeightRenaming(r"ffn\.fc2_weight", "mlp.down_proj.weight"),
+            WeightConverter(
+                source_patterns=["attn.layernorm_qkv.weight"],
+                target_patterns=[
+                    "self_attn.q_proj.weight",
+                    "self_attn.k_proj.weight",
+                    "self_attn.v_proj.weight",
+                ],
+                operations=[Chunk(dim=0)],
+            ),
+            WeightConverter(
+                source_patterns=["ffn.fc1_weight"],
+                target_patterns=[
+                    "mlp.gate_proj.weight",
+                    "mlp.up_proj.weight",
+                ],
+                operations=[Chunk(dim=0)],
+            ),
+        ],
+        "ESMCForMaskedLM": [
+            WeightRenaming(r"lm_head\.0\.", "lm_head.dense."),
+            WeightRenaming(r"lm_head\.2\.", "lm_head.layer_norm."),
+            WeightRenaming(r"lm_head\.3\.", "lm_head.decoder."),
+        ],
+        "esmfold2": [
+            # TODO(temporary): the published ESMFold2 checkpoint predates the SwiGLU consolidation
+            # and still stores those blocks under their old per-module names. Drop this whole entry
+            # once the merged ESMFold2+ESMC checkpoint is regenerated with the canonical `w12`/`w3`.
+            WeightRenaming(r"\.w_up\.", ".w12."),  # SwiGLU-FFN (atom transformer)
+            WeightRenaming(r"\.w_down\.", ".w3."),
+            WeightRenaming(r"\.lin_swish\.", ".ffn.w12."),  # ConditionedTransitionBlock
+            WeightRenaming(r"\.lin_out\.", ".ffn.w3."),
+            # TODO(temporary): checkpoint predates the SWA attention q/k/v projection split (M19);
+            # it packed q/k/v into a single ``Wqkv``. Drop once the merged checkpoint is regenerated
+            # with split projections. (The pair-bias attention keeps its packed ``kv_proj`` for now:
+            # splitting it too would clash with these q/k/v names under the bidirectional converter.)
+            WeightConverter(
+                source_patterns=["attn.Wqkv.weight"],
+                target_patterns=["attn.q_proj.weight", "attn.k_proj.weight", "attn.v_proj.weight"],
+                operations=[Chunk(dim=0)],
+            ),
+            # TODO(temporary): checkpoint predates de-Sequentializing the nn.Sequential blocks (M26)
+            # into named submodules. Drop this whole group after the merged checkpoint is regenerated.
+            WeightRenaming(r"output_mlp\.0\.", "output_fc1."),  # SingleToPair (Linear, GELU, Linear)
+            WeightRenaming(r"output_mlp\.2\.", "output_fc2."),
+            WeightRenaming(r"adaln_modulation\.1\.", "adaln_linear."),  # SWAAtomBlock (SiLU, Linear)
+            WeightRenaming(r"base_z_linear\.0\.", "base_z_input_norm."),  # LM shim (Norm, Linear)
+            WeightRenaming(r"base_z_linear\.1\.", "base_z_proj."),
+            WeightRenaming(r"base_z_mlp\.0\.", "base_z_to_pair."),  # LM shim (SingleToPair, Norm)
+            WeightRenaming(r"base_z_mlp\.1\.", "base_z_output_norm."),
+            WeightRenaming(r"compute_bias\.0\.", "bias_norm."),  # MSAPairWeightedAveraging (Norm, Linear)
+            WeightRenaming(r"compute_bias\.1\.", "bias_proj."),
+        ],
         "dinov3_convnext": [WeightRenaming(r"(?<!model\.)stages", r"model.stages")],
         "dinov3_vit": [WeightRenaming(r"(?<!model\.)layer.", r"model.layer.")],
         "timesfm2_5": [

diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py
@@ -134,6 +134,15 @@ def _build_kernel_mapping() -> dict:
                     trust_remote_code=True,
                 ),
             },
+            "ESMFold2TriangleMultiplication": {
+                "cuda": {
+                    Mode.INFERENCE: LayerRepository(
+                        repo_id="Rocketknight1/esmfold2-trimul-kernel",
+                        layer_name="ESMFold2TriangleMultiplication",
+                        version=1,
+                    ),
+                },
+            },
             "SwiGLUMLP": {
                 "cuda": {
                     Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -138,6 +138,8 @@
     from .ernie4_5_moe import *
     from .ernie4_5_vl_moe import *
     from .esm import *
+    from .esmc import *
+    from .esmfold2 import *
     from .evolla import *
     from .exaone4 import *
     from .exaone4_5 import *

diff --git a/src/transformers/models/auto/auto_mappings.py b/src/transformers/models/auto/auto_mappings.py
@@ -176,6 +176,8 @@
         ("ernie4_5_vl_moe_text", "Ernie4_5_VLMoeTextConfig"),
         ("ernie4_5_vl_moe_vision", "Ernie4_5_VLMoeVisionConfig"),
         ("esm", "EsmConfig"),
+        ("esmc", "ESMCConfig"),
+        ("esmfold2", "ESMFold2Config"),
         ("eurobert", "EuroBertConfig"),
         ("evolla", "EvollaConfig"),
         ("exaone4", "Exaone4Config"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -152,6 +152,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("ernie4_5_moe", "Ernie4_5_MoeModel"),
         ("ernie4_5_vl_moe", "Ernie4_5_VLMoeModel"),
         ("esm", "EsmModel"),
+        ("esmc", "ESMCModel"),
+        ("esmfold2", "ESMFold2Model"),
         ("eurobert", "EuroBertModel"),
         ("evolla", "EvollaModel"),
         ("exaone4", "Exaone4Model"),
@@ -1145,6 +1147,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("electra", "ElectraForMaskedLM"),
         ("ernie", "ErnieForMaskedLM"),
         ("esm", "EsmForMaskedLM"),
+        ("esmc", "ESMCForMaskedLM"),
         ("eurobert", "EuroBertForMaskedLM"),
         ("flaubert", "FlaubertWithLMHeadModel"),
         ("fnet", "FNetForMaskedLM"),
@@ -1339,6 +1342,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("electra", "ElectraForSequenceClassification"),
         ("ernie", "ErnieForSequenceClassification"),
         ("esm", "EsmForSequenceClassification"),
+        ("esmc", "ESMCForSequenceClassification"),
         ("eurobert", "EuroBertForSequenceClassification"),
         ("exaone4", "Exaone4ForSequenceClassification"),
         ("falcon", "FalconForSequenceClassification"),
@@ -1568,6 +1572,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("electra", "ElectraForTokenClassification"),
         ("ernie", "ErnieForTokenClassification"),
         ("esm", "EsmForTokenClassification"),
+        ("esmc", "ESMCForTokenClassification"),
         ("eurobert", "EuroBertForTokenClassification"),
         ("exaone4", "Exaone4ForTokenClassification"),
         ("falcon", "FalconForTokenClassification"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -119,6 +119,8 @@
         ("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("ernie", "BertTokenizer" if is_tokenizers_available() else None),
         ("esm", "EsmTokenizer"),
+        ("esmc", "ESMCTokenizer" if is_tokenizers_available() else None),
+        ("esmfold2", "ESMCTokenizer" if is_tokenizers_available() else None),
         ("falcon_mamba", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
         ("fastspeech2_conformer", "FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None),
         ("flaubert", "FlaubertTokenizer"),

diff --git a/src/transformers/models/esmc/__init__.py b/src/transformers/models/esmc/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2026 Biohub. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # evaluated by static type checkers only; skipped on a normal runtime import
+    from .configuration_esmc import *
+    from .modeling_esmc import *
+    from .tokenization_esmc import *
+else:  # runtime path: replace this package's entry in sys.modules with a _LazyModule
+    import sys
+
+    _file = globals()["__file__"]  # this file's path; feeds define_import_structure and _LazyModule
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)