From 730c87629a91a705e0177877c907d1d9e9249cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 18:30:27 +0000 Subject: [PATCH 01/20] New tiny model generation --- scripts/generate_tiny_models.py | 437 ------------------ scripts/generate_tiny_models/README.md | 50 ++ scripts/generate_tiny_models/__init__.py | 0 scripts/generate_tiny_models/_common.py | 253 ++++++++++ .../for_causal_lm/__init__.py | 0 .../for_causal_lm/cohere2_for_causal_lm.py | 40 ++ .../for_causal_lm/cohere_for_causal_lm.py | 40 ++ .../deepseek_v3_for_causal_lm.py | 40 ++ .../deepseek_v3_for_causal_lm_0528.py | 42 ++ .../falcon_mamba_for_causal_lm.py | 40 ++ .../for_causal_lm/gemma2_for_causal_lm.py | 40 ++ .../for_causal_lm/gemma_for_causal_lm.py | 40 ++ .../for_causal_lm/glm4_moe_for_causal_lm.py | 42 ++ .../for_causal_lm/gpt2_lm_head_model.py | 40 ++ .../for_causal_lm/gpt_neox_for_causal_lm.py | 40 ++ .../for_causal_lm/gpt_oss_for_causal_lm.py | 42 ++ .../for_causal_lm/llama_for_causal_lm_3.py | 40 ++ .../for_causal_lm/llama_for_causal_lm_3_1.py | 40 ++ .../for_causal_lm/llama_for_causal_lm_3_2.py | 40 ++ .../mistral_for_causal_lm_0_1.py | 40 ++ .../mistral_for_causal_lm_0_2.py | 40 ++ .../for_causal_lm/opt_for_causal_lm.py | 40 ++ .../for_causal_lm/peft_qwen3_for_causal_lm.py | 29 ++ .../peft_qwen3_for_causal_lm_2.py | 31 ++ .../for_causal_lm/phi3_for_causal_lm_3.py | 40 ++ .../for_causal_lm/phi3_for_causal_lm_3_5.py | 40 ++ .../for_causal_lm/qwen2_for_causal_lm_2_5.py | 40 ++ .../qwen2_for_causal_lm_2_5_coder.py | 40 ++ .../for_causal_lm/qwen3_for_causal_lm.py | 42 ++ .../for_causal_lm/qwen3_moe_for_causal_lm.py | 42 ++ .../small_qwen2_for_causal_lm_2_5.py | 41 ++ .../small_qwen3_for_causal_lm.py | 41 ++ .../for_conditional_generation/__init__.py | 0 .../for_conditional_generation/bart_model.py | 32 ++ .../gemma3_for_conditional_generation.py | 48 ++ .../gemma4_for_conditional_generation.py | 60 +++ .../idefics2_for_conditional_generation.py | 53 +++ .../idefics3_for_conditional_generation.py | 48 ++ .../internvl_for_conditional_generation.py | 48 ++ .../llava_for_conditional_generation.py | 48 ++ .../llava_next_for_conditional_generation.py | 54 +++ .../paligemma_for_conditional_generation.py | 48 ++ .../qwen2_5_vl_for_conditional_generation.py | 62 +++ .../qwen2_vl_for_conditional_generation.py | 57 +++ .../qwen3_5_for_conditional_generation.py | 62 +++ .../qwen3_vl_for_conditional_generation.py | 56 +++ .../smolvlm_for_conditional_generation.py | 48 ++ .../t5_for_conditional_generation.py | 33 ++ .../for_sequence_classification/__init__.py | 0 .../gpt_neox_for_sequence_classification.py | 41 ++ .../llama_for_sequence_classification_3_2.py | 41 ++ .../qwen2_for_sequence_classification_2_5.py | 43 ++ .../qwen3_for_sequence_classification.py | 43 ++ .../qwen3_moe_for_sequence_classification.py | 42 ++ tests/conftest.py | 1 + 55 files changed, 2343 insertions(+), 437 deletions(-) delete mode 100644 scripts/generate_tiny_models.py create mode 100644 scripts/generate_tiny_models/README.md create mode 100644 scripts/generate_tiny_models/__init__.py create mode 100644 scripts/generate_tiny_models/_common.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/__init__.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py create mode 100644 
scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py create mode 100644 scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/__init__.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/bart_model.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py create 
mode 100644 scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py create mode 100644 scripts/generate_tiny_models/for_sequence_classification/__init__.py create mode 100644 scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py create mode 100644 scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py create mode 100644 scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py create mode 100644 scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py create mode 100644 scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py diff --git a/scripts/generate_tiny_models.py b/scripts/generate_tiny_models.py deleted file mode 100644 index 193dee9e080..00000000000 --- a/scripts/generate_tiny_models.py +++ /dev/null @@ -1,437 +0,0 @@ -# Copyright 2020-2026 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script generates tiny models used in the TRL library for unit tests. It pushes them to the Hub under the -# `trl-internal-testing` organization. -# This script is meant to be run when adding new tiny model to the TRL library. 
- -import torch -from huggingface_hub import HfApi, ModelCard -from peft import LoraConfig, get_peft_model -from torch import nn -from transformers import ( - AutoConfig, - AutoProcessor, - AutoTokenizer, - BartModel, - Cohere2Config, - Cohere2ForCausalLM, - CohereConfig, - CohereForCausalLM, - DeepseekV3Config, - DeepseekV3ForCausalLM, - FalconMambaConfig, - FalconMambaForCausalLM, - Gemma2Config, - Gemma2ForCausalLM, - Gemma3ForConditionalGeneration, - Gemma4ForConditionalGeneration, - GemmaConfig, - GemmaForCausalLM, - GenerationConfig, - Glm4MoeConfig, - Glm4MoeForCausalLM, - GPT2Config, - GPT2LMHeadModel, - GPTNeoXConfig, - GPTNeoXForCausalLM, - GPTNeoXForSequenceClassification, - GptOssConfig, - GptOssForCausalLM, - Idefics2Config, - Idefics2ForConditionalGeneration, - Idefics3ForConditionalGeneration, - InternVLForConditionalGeneration, - LlamaConfig, - LlamaForCausalLM, - LlamaForSequenceClassification, - LlavaForConditionalGeneration, - LlavaNextForConditionalGeneration, - MistralConfig, - MistralForCausalLM, - OPTConfig, - OPTForCausalLM, - PaliGemmaForConditionalGeneration, - Phi3Config, - Phi3ForCausalLM, - Qwen2_5_VLConfig, - Qwen2_5_VLForConditionalGeneration, - Qwen2Config, - Qwen2ForCausalLM, - Qwen2ForSequenceClassification, - Qwen2VLConfig, - Qwen2VLForConditionalGeneration, - Qwen3_5Config, - Qwen3_5ForConditionalGeneration, - Qwen3Config, - Qwen3ForCausalLM, - Qwen3ForSequenceClassification, - Qwen3MoeConfig, - Qwen3MoeForCausalLM, - Qwen3MoeForSequenceClassification, - Qwen3VLConfig, - Qwen3VLForConditionalGeneration, - SmolVLMForConditionalGeneration, - T5ForConditionalGeneration, -) - - -ORGANIZATION = "trl-internal-testing" - -MODEL_CARD = """ ---- -library_name: transformers -tags: [trl] ---- - -# Tiny {model_class_name} - -This is a minimal model built for unit tests in the [TRL](https://github.com/huggingface/trl) library. -""" - - -api = HfApi() - - -def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, force=False): - model_class_name = model.__class__.__name__ - content = MODEL_CARD.format(model_class_name=model_class_name) - model_card = ModelCard(content) - if prefix is not None: - model_class_name = f"{prefix}-{model_class_name}" - repo_id = f"{ORGANIZATION}/{model_class_name}" - if suffix is not None: - repo_id += f"-{suffix}" - - if api.repo_exists(repo_id) and not force: - print(f"Model {repo_id} already exists, skipping") - else: - model.push_to_hub(repo_id) - model_card.push_to_hub(repo_id) - if tokenizer is not None: - tokenizer.push_to_hub(repo_id) - if generation_config is not None: - generation_config.push_to_hub(repo_id) - - -def init_weights_tiny_model(model): - """ - Initialize tiny test models to avoid NaNs from uninitialized weights. 
- - Uses safe defaults: - - Linear/Conv1d: Xavier uniform (weights), zero (biases) - - Embedding: Normal(0, 0.02) - - LayerNorm: Ones (weights), zero (biases) - - Args: - model: PyTorch model (modified in-place) - """ - for module in model.modules(): - if isinstance(module, nn.Linear): - # Attention/MLP projections → Xavier or Normal - if module.bias is not None: - nn.init.zeros_(module.bias) - nn.init.xavier_uniform_(module.weight) - - elif isinstance(module, nn.Embedding): - # Token embeddings → GPT-style Normal - nn.init.normal_(module.weight, mean=0.0, std=0.02) - - elif isinstance(module, nn.LayerNorm): - # LayerNorm weights always 1, bias 0 - nn.init.ones_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - - elif isinstance(module, nn.Conv1d): - # Convolutional layers → Xavier or Normal - if module.bias is not None: - nn.init.zeros_(module.bias) - nn.init.xavier_uniform_(module.weight) - - -# Decoder models -for model_id, config_class, model_class, dtype, suffix in [ - # ("bigscience/bloomz-560m", BloomConfig, BloomForCausalLM, None), # loading fails with this model, see https://huggingface.co/bigscience/bloomz-560m/discussions/14 - ("CohereLabs/aya-expanse-8b", CohereConfig, CohereForCausalLM, torch.float16, None), - ("CohereLabs/tiny-aya-earth", Cohere2Config, Cohere2ForCausalLM, torch.bfloat16, None), - ("deepseek-ai/DeepSeek-R1", DeepseekV3Config, DeepseekV3ForCausalLM, torch.bfloat16, None), - # It's important to have R1-0528 as it doesn't have the same chat template - ("deepseek-ai/DeepSeek-R1-0528", DeepseekV3Config, DeepseekV3ForCausalLM, torch.bfloat16, "0528"), - ("tiiuae/falcon-7b-instruct", FalconMambaConfig, FalconMambaForCausalLM, torch.bfloat16, None), - ("google/gemma-2-2b-it", Gemma2Config, Gemma2ForCausalLM, torch.bfloat16, None), - ("google/gemma-7b-it", GemmaConfig, GemmaForCausalLM, torch.bfloat16, None), - ("openai-community/gpt2", GPT2Config, GPT2LMHeadModel, torch.float32, None), - ("EleutherAI/pythia-14m", GPTNeoXConfig, GPTNeoXForCausalLM, torch.float16, None), - ("meta-llama/Meta-Llama-3-8B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3"), - ("meta-llama/Llama-3.1-8B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3.1"), - ("meta-llama/Llama-3.2-1B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3.2"), - ("mistralai/Mistral-7B-Instruct-v0.1", MistralConfig, MistralForCausalLM, torch.bfloat16, "0.1"), - ("mistralai/Mistral-7B-Instruct-v0.2", MistralConfig, MistralForCausalLM, torch.bfloat16, "0.2"), - ("facebook/opt-1.3b", OPTConfig, OPTForCausalLM, torch.float16, None), - ("microsoft/Phi-3-mini-4k-instruct", Phi3Config, Phi3ForCausalLM, torch.bfloat16, "3"), - ("microsoft/Phi-3.5-mini-instruct", Phi3Config, Phi3ForCausalLM, torch.bfloat16, "3.5"), - ("Qwen/Qwen2.5-32B-Instruct", Qwen2Config, Qwen2ForCausalLM, torch.bfloat16, "2.5"), - ("Qwen/Qwen2.5-Coder-0.5B", Qwen2Config, Qwen2ForCausalLM, torch.bfloat16, "2.5-Coder"), - ("Qwen/Qwen3-8B", Qwen3Config, Qwen3ForCausalLM, torch.bfloat16, None), -]: - revision = "refs/pr/14" if model_id == "Qwen/Qwen3-8B" else "main" # chat template with {% generation %} - tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) - generation_config = GenerationConfig.from_pretrained(model_id, revision=revision) - config = config_class( - vocab_size=len(tokenizer.vocab), - hidden_size=8, - num_attention_heads=4, - num_key_value_heads=2, - num_hidden_layers=2, - intermediate_size=32, - ) - model = model_class(config).to(dtype=dtype) - 
init_weights_tiny_model(model) - push_to_hub(model, tokenizer, generation_config, "tiny", suffix) - -# MoE models -for model_id, config_class, model_class, dtype, suffix in [ - ("Qwen/Qwen3-30B-A3B", Qwen3MoeConfig, Qwen3MoeForCausalLM, torch.bfloat16, None), - ("openai/gpt-oss-20b", GptOssConfig, GptOssForCausalLM, torch.bfloat16, None), - ("zai-org/GLM-4.5", Glm4MoeConfig, Glm4MoeForCausalLM, torch.bfloat16, None), -]: - tokenizer = AutoTokenizer.from_pretrained(model_id) - generation_config = GenerationConfig.from_pretrained(model_id) - kwargs = {} - if model_id == "zai-org/GLM-4.5": - kwargs["n_routed_experts"] = 4 - elif model_id == "Qwen/Qwen3-30B-A3B": - kwargs["num_experts"] = 4 - elif model_id == "openai/gpt-oss-20b": - kwargs["num_local_experts"] = 4 - - config = config_class( - vocab_size=len(tokenizer.vocab), - hidden_size=8, - num_attention_heads=4, - num_key_value_heads=2, - num_hidden_layers=2, - intermediate_size=32, - num_experts_per_tok=2, - **kwargs, - ) - model = model_class(config).to(dtype=dtype) - init_weights_tiny_model(model) - push_to_hub(model, tokenizer, generation_config, "tiny", suffix) - -# Two slightly bigger models, required for vLLM testing -tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct") -generation_config = GenerationConfig.from_pretrained("Qwen/Qwen2.5-32B-Instruct") -config = Qwen2Config( - vocab_size=len(tokenizer.vocab), - hidden_size=128, # increase hidden size so that hidden_size // num_attention_heads = 32, required for vLLM - num_attention_heads=4, - num_key_value_heads=2, - num_hidden_layers=2, - intermediate_size=32, -) -model = Qwen2ForCausalLM(config).to(dtype=torch.bfloat16) -push_to_hub(model, tokenizer, generation_config, "small", "2.5") - -tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B") -generation_config = GenerationConfig.from_pretrained("Qwen/Qwen3-4B") -config = Qwen3Config( - vocab_size=len(tokenizer.vocab), - hidden_size=128, # increase hidden size so that hidden_size // num_attention_heads = 32, required for vLLM - num_attention_heads=4, - num_key_value_heads=2, - num_hidden_layers=2, - intermediate_size=32, -) -model = Qwen3ForCausalLM(config).to(dtype=torch.bfloat16) -push_to_hub(model, tokenizer, generation_config, "small") - -# Reward models -for model_id, model_class, dtype, suffix in [ - ("EleutherAI/pythia-14m", GPTNeoXForSequenceClassification, torch.bfloat16, None), - ("meta-llama/Llama-3.2-1B-Instruct", LlamaForSequenceClassification, torch.bfloat16, "3.2"), - ("Qwen/Qwen2.5-32B-Instruct", Qwen2ForSequenceClassification, torch.bfloat16, "2.5"), - ("Qwen/Qwen3-4B", Qwen3ForSequenceClassification, torch.bfloat16, None), -]: - tokenizer = AutoTokenizer.from_pretrained(model_id) - generation_config = GenerationConfig.from_pretrained(model_id) - kwargs = { - "num_labels": 1, - "hidden_size": 16, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "num_hidden_layers": 2, - "intermediate_size": 32, - } - config = AutoConfig.from_pretrained(model_id, **kwargs) - # Bug in transformers: it ignores num_hidden_layers to build layer_types - if model_id in ("Qwen/Qwen2.5-32B-Instruct", "Qwen/Qwen3-4B"): - config.layer_types = config.layer_types[:2] - model = model_class(config).to(dtype=dtype) - init_weights_tiny_model(model) - push_to_hub(model, tokenizer, generation_config, "tiny", suffix) - -# MoE Reward models -for model_id, model_class, dtype, suffix in [ - ("Qwen/Qwen3-30B-A3B", Qwen3MoeForSequenceClassification, torch.bfloat16, None), -]: - tokenizer = 
AutoTokenizer.from_pretrained(model_id) - generation_config = GenerationConfig.from_pretrained(model_id) - kwargs = { - "num_labels": 1, - "hidden_size": 16, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "num_hidden_layers": 2, - "intermediate_size": 32, - "num_experts": 4, - "num_experts_per_tok": 2, - } - config = AutoConfig.from_pretrained(model_id, **kwargs) - model = model_class(config).to(dtype=dtype) - push_to_hub(model, tokenizer, generation_config, "tiny", suffix) - - -# Encoder-decoder models -for model_id, model_class, dtype, suffix in [ - ("facebook/bart-base", BartModel, torch.float32, None), - ("google/flan-t5-small", T5ForConditionalGeneration, torch.float32, None), -]: - tokenizer = AutoTokenizer.from_pretrained(model_id) - generation_config = GenerationConfig.from_pretrained(model_id) if model_id != "facebook/bart-base" else None - config = AutoConfig.from_pretrained(model_id) - config.d_model = 24 - model = model_class(config).to(dtype=dtype) - push_to_hub(model, tokenizer, generation_config, "tiny", suffix) - - -# Vision Language Models -for model_id, model_class, dtype in [ - ("google/gemma-3-4b-it", Gemma3ForConditionalGeneration, torch.bfloat16), - ("google/gemma-4-E2B-it", Gemma4ForConditionalGeneration, torch.bfloat16), - ("google/paligemma-3b-pt-224", PaliGemmaForConditionalGeneration, torch.float32), - ("HuggingFaceM4/idefics2-8b", Idefics2ForConditionalGeneration, torch.float32), - ("HuggingFaceM4/Idefics3-8B-Llama3", Idefics3ForConditionalGeneration, torch.bfloat16), - ("HuggingFaceTB/SmolVLM2-2.2B-Instruct", SmolVLMForConditionalGeneration, torch.float32), - ("llava-hf/llava-1.5-7b-hf", LlavaForConditionalGeneration, torch.float16), - # Original model dtype is float16, but it triggers CUDA device side assert error (see GH-4741): - ("llava-hf/llava-v1.6-mistral-7b-hf", LlavaNextForConditionalGeneration, torch.bfloat16), - ("OpenGVLab/InternVL3-8B-hf", InternVLForConditionalGeneration, torch.bfloat16), - ("Qwen/Qwen2-VL-2B-Instruct", Qwen2VLForConditionalGeneration, torch.bfloat16), - ("Qwen/Qwen2.5-VL-3B-Instruct", Qwen2_5_VLForConditionalGeneration, torch.bfloat16), - ("Qwen/Qwen3-VL-2B-Instruct", Qwen3VLForConditionalGeneration, torch.bfloat16), - ("Qwen/Qwen3.5-0.8B", Qwen3_5ForConditionalGeneration, torch.bfloat16), -]: - processor = AutoProcessor.from_pretrained(model_id) - generation_config = GenerationConfig.from_pretrained(model_id) if model_id != "Qwen/Qwen3.5-0.8B" else None - - text_config = { - "num_hidden_layers": 2, - "hidden_size": 16, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "layer_types": None, # Set it automatically from num_hidden_layers - } - vision_config = { - "num_hidden_layers": 2, - "hidden_size": 16, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "embed_dim": 64, - } - kwargs = {} - - if issubclass(model_class.config_class, (Qwen2VLConfig, Qwen2_5_VLConfig)): - text_config["rope_scaling"] = {"type": "default", "mrope_section": [1, 1], "rope_type": "default"} - vision_config["depth"] = 2 - # Different dict object from text_config; see GH-4101 and transformers#41020 - kwargs["rope_scaling"] = {"type": "default", "mrope_section": [1, 1], "rope_type": "default"} - - if issubclass(model_class.config_class, Qwen2_5_VLConfig): - vision_config["out_hidden_size"] = 16 - # Different dict object at the config root; see GH-4101 and transformers#41020 - kwargs["num_hidden_layers"] = 2 - kwargs["hidden_size"] = 16 - kwargs["num_attention_heads"] = 4 - - if issubclass(model_class.config_class, Idefics2Config): - 
kwargs["perceiver_config"] = {"hidden_size": 16} - - if issubclass(model_class.config_class, Qwen3VLConfig): - # So hasattr(config, "layer_types") is False - # See: https://github.com/huggingface/transformers/blob/fe5ca9ddaa07fac2872407e75c7a7661216ac956/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L420 - del text_config["layer_types"] - # "mrope_section" needs 3 elements: for dim, offset in enumerate((1, 2), start=1): mrope_section[dim] - # See: https://github.com/huggingface/transformers/blob/fe5ca9ddaa07fac2872407e75c7a7661216ac956/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L361 - text_config["rope_scaling"] = {"mrope_interleaved": True, "mrope_section": [2, 2, 2], "rope_type": "default"} - vision_config["depth"] = 2 - vision_config["out_hidden_size"] = 16 - - if issubclass(model_class.config_class, Qwen3_5Config): - # For tiny layer counts, default `layer_types` can end up with no full-attention layers (e.g. 2 layers and - # default interval 4), which breaks Qwen3.5 dynamic cache logic. Keep one full-attention layer at the end. - text_config["layer_types"] = ["linear_attention", "full_attention"] - text_config["full_attention_interval"] = 2 - # Qwen3.5-VL vision config expects `depth`/`num_heads`, not `num_hidden_layers`/`num_attention_heads`. - vision_config.pop("num_hidden_layers", None) - vision_config.pop("num_attention_heads", None) - vision_config.pop("num_key_value_heads", None) - vision_config.pop("embed_dim", None) - vision_config["depth"] = 2 - vision_config["num_heads"] = 4 - vision_config["intermediate_size"] = 32 - vision_config["out_hidden_size"] = 16 - - if model_id == "llava-hf/llava-v1.6-mistral-7b-hf": - # Hotfix: llava-hf/llava-v1.6-mistral-7b-hf mistakesly sets text_config.dtype to "bfloat16". - # See https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf/discussions/46 - text_config["dtype"] = None - - if model_class is Gemma4ForConditionalGeneration: - # Gemma4 rope validation fails when passing text_config as a dict, so we mutate the config directly. - config = AutoConfig.from_pretrained(model_id) - for k, v in text_config.items(): - setattr(config.text_config, k, v) - for k, v in vision_config.items(): - setattr(config.vision_config, k, v) - config.text_config.layer_types = ["sliding_attention", "full_attention"] - config.text_config.num_kv_shared_layers = 0 - config.text_config.global_head_dim = 8 - config.text_config.hidden_size_per_layer_input = 16 - config.audio_config = None - else: - config = AutoConfig.from_pretrained(model_id, text_config=text_config, vision_config=vision_config, **kwargs) - model = model_class(config).to(dtype=dtype) - - if issubclass(model_class.config_class, Qwen3_5Config): - # Qwen3.5 models has some weights in float32, to mirror this in the tiny model we need to convert them to float32 manually. 
-        for layer in model.model.language_model.layers:
-            if hasattr(layer, "linear_attn"):  # applies to linear attention layers only
-                layer.linear_attn.A_log.data = layer.linear_attn.A_log.data.float()
-                layer.linear_attn.norm.weight.data = layer.linear_attn.norm.weight.data.float()
-
-    push_to_hub(model, processor, generation_config, "tiny")
-
-# PEFT models
-model = Qwen3ForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen3ForCausalLM", dtype="auto")
-model = get_peft_model(model, LoraConfig())
-generation_config = GenerationConfig.from_pretrained("trl-internal-testing/tiny-Qwen3ForCausalLM")
-push_to_hub(model, None, None, "tiny")
-
-# Same model, but different weights
-model = Qwen3ForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen3ForCausalLM", dtype="auto")
-model = get_peft_model(model, LoraConfig())
-generation_config = GenerationConfig.from_pretrained("trl-internal-testing/tiny-Qwen3ForCausalLM")
-push_to_hub(model, None, None, "tiny", "2")
diff --git a/scripts/generate_tiny_models/README.md b/scripts/generate_tiny_models/README.md
new file mode 100644
index 00000000000..14d96a793c7
--- /dev/null
+++ b/scripts/generate_tiny_models/README.md
@@ -0,0 +1,50 @@
+# Tiny model generation
+
+This directory contains one script per tiny model used by the TRL test suite. Each script builds a random-weight, minimally sized model on top of a real tokenizer/processor and pushes it to the `trl-internal-testing` organization on the Hub.
+
+## Layout
+
+```
+generate_tiny_models/
+├── _common.py                    # shared helpers (push_to_hub, smoke_test, print_config_diff, ...)
+├── for_causal_lm/                # *ForCausalLM + GPT-2 LM head + small/PEFT variants
+├── for_sequence_classification/  # *ForSequenceClassification (reward models)
+└── for_conditional_generation/   # *ForConditionalGeneration (VLMs + T5 + Bart encoder-decoder)
+```
+
+## Running
+
+From the repo root, invoke a script by its module path:
+
+```bash
+python -m scripts.generate_tiny_models.for_causal_lm.qwen3_for_causal_lm
+```
+
+Each script:
+
+1. Checks that the installed `transformers` version matches the one pinned in the script (fails otherwise).
+2. Builds the tiny model with random weights.
+3. Runs `smoke_test` — a minimal forward pass to catch config misspecification and NaNs.
+4. Runs `check_dtype_pattern`, which flags tensors whose dtype diverges from the reference checkpoint (read from the safetensors header, no weight download).
+5. Runs `print_config_diff` — prints every flat-key difference between the reference Hub config and the tiny model's config (for debugging scale-downs).
+6. Pushes the model, tokenizer/processor, generation config, and model card to the Hub.
+
+If the repo already exists on the Hub, the push is skipped (pass `force=True` in `push_to_hub(...)` to overwrite, or run the script with `--create-pr` to open a PR instead).
+
+## Version pinning
+
+Every script declares `TRANSFORMERS_VERSION = "X.Y.Z"`, which is:
+
+```
+max(version that introduced the model, TRL's transformers floor)
+```
+
+The floor (currently `4.56.2`) is the `transformers>=` bound from `pyproject.toml`. Scripts for models introduced after the floor pin a higher version (e.g. Qwen3-VL pins `4.57.0`, Gemma4 pins `5.6.0`). The check is an exact match via `packaging.version.Version`; install the pinned version before running.
+
+**Why exact?** transformers is backward-compatible (a checkpoint saved by X loads on any ≥ X) but not forward-compatible. TRL CI runs against the floor, so tiny models must be saved with the oldest version that supports them — any newer save risks using config fields the floor can't parse. The exact-match check prevents accidental drift.
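+
+## Script skeleton
+
+Every generator follows the same shape. A minimal sketch, mirroring `for_causal_lm/cohere2_for_causal_lm.py` (license header omitted); swap in your model's reference id, config class, scale-down values, and dtype:
+
+```python
+import torch
+from transformers import AutoTokenizer, Cohere2Config, Cohere2ForCausalLM, GenerationConfig
+
+from .._common import (
+    check_dtype_pattern,
+    check_transformers_version,
+    init_weights_tiny_model,
+    print_config_diff,
+    push_to_hub,
+    smoke_test,
+)
+
+TRANSFORMERS_VERSION = "4.56.2"  # max(version that introduced the model, TRL floor)
+check_transformers_version(TRANSFORMERS_VERSION)
+
+MODEL_ID = "CohereLabs/tiny-aya-earth"  # reference checkpoint: real tokenizer, chat template, config
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+generation_config = GenerationConfig.from_pretrained(MODEL_ID)
+config = Cohere2Config(
+    vocab_size=len(tokenizer.vocab),  # keep the full vocab so the real tokenizer stays usable
+    hidden_size=8,
+    num_attention_heads=4,
+    num_key_value_heads=2,
+    num_hidden_layers=2,
+    intermediate_size=32,
+)
+model = Cohere2ForCausalLM(config).to(dtype=torch.bfloat16)
+init_weights_tiny_model(model)        # avoid NaNs from uninitialized weights
+smoke_test(model, tokenizer)          # minimal forward pass
+check_dtype_pattern(MODEL_ID, model)  # per-tensor dtype vs the reference safetensors header
+print_config_diff(MODEL_ID, model)    # flat-key diff vs the reference config
+push_to_hub(model, tokenizer, generation_config, "tiny")
+```
+
+## Adding a new tiny model
+
+1.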
Pick the right subfolder based on the model class suffix (`ForCausalLM`, `ForSequenceClassification`, `ForConditionalGeneration`). +2. Copy an existing script with the closest shape and adapt it — reference model id, config class, model class, special kwargs. +3. Set `TRANSFORMERS_VERSION` to the release that introduced the model (or to the TRL floor, whichever is higher). +4. Run it. Inspect the `[smoke_test]` and `[config_diff]` output before letting it push. diff --git a/scripts/generate_tiny_models/__init__.py b/scripts/generate_tiny_models/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/generate_tiny_models/_common.py b/scripts/generate_tiny_models/_common.py new file mode 100644 index 00000000000..1f713853f59 --- /dev/null +++ b/scripts/generate_tiny_models/_common.py @@ -0,0 +1,253 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Shared utilities for the tiny-model generation scripts in this directory. +# Each sibling script builds a single tiny model and pushes it to the Hub under +# the `trl-internal-testing` organization. + +import argparse +import os +import tempfile + +import torch +from huggingface_hub import CommitOperationAdd, HfApi, ModelCard +from packaging.version import Version +from torch import nn +from transformers import AutoConfig + + +ORGANIZATION = "trl-internal-testing" + +MODEL_CARD = """ +--- +library_name: transformers +tags: [trl] +--- + +# Tiny {model_class_name} + +This is a minimal model built for unit tests in the [TRL](https://github.com/huggingface/trl) library. +""" + + +api = HfApi() + + +def check_transformers_version(expected_version): + """Raise unless the installed transformers matches `expected_version` exactly.""" + import transformers + + if Version(transformers.__version__) != Version(expected_version): + raise RuntimeError( + f"This script requires transformers=={expected_version}, " f"but {transformers.__version__} is installed." + ) + + +def smoke_test(model, tokenizer_or_processor=None): + """Run a minimal forward pass to sanity-check the tiny model doesn't crash or produce NaNs.""" + model.eval() + device = next(model.parameters()).device + + if tokenizer_or_processor is not None and hasattr(tokenizer_or_processor, "image_processor"): + # VLM path: build a dummy (image, text) input via the processor. 
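+        # Two single-turn conversations, with the image placed before the text in one and after it in the
+        # other, exercise batch padding and image-token splicing in both positions.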
+ from PIL import Image + + processor = tokenizer_or_processor + red = Image.new("RGB", (24, 24), color="red") + blue = Image.new("RGB", (24, 24), color="blue") + messages = [ + [{"role": "user", "content": [{"type": "image", "image": red}, {"type": "text", "text": "What is this?"}]}], + [{"role": "user", "content": [{"type": "text", "text": "Is it blue?"}, {"type": "image", "image": blue}]}], + ] + inputs = processor.apply_chat_template( + conversation=messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + padding=True, + ).to(device) + else: + inputs = {"input_ids": torch.tensor([[1, 2, 3, 4]], device=device)} + + with torch.no_grad(): + out = model(**inputs) + + logits = getattr(out, "logits", None) + if logits is None: + logits = getattr(out, "last_hidden_state", None) + if logits is None: + raise RuntimeError(f"[smoke_test] {model.__class__.__name__}: no logits or last_hidden_state on output") + if torch.isnan(logits).any(): + raise RuntimeError(f"[smoke_test] {model.__class__.__name__}: NaN in forward output") + print(f"[smoke_test] {model.__class__.__name__}: OK (output shape {tuple(logits.shape)})") + + +def _flatten(d, prefix=""): + out = {} + for k, v in d.items(): + key = f"{prefix}{k}" if prefix else k + if isinstance(v, dict): + out.update(_flatten(v, f"{key}.")) + else: + out[key] = v + return out + + +_DIFF_IGNORE = {"_name_or_path", "transformers_version", "architectures", "model_type", "torch_dtype", "dtype"} + + +_TORCH_TO_SAFETENSORS_DTYPE = { + torch.float32: "F32", + torch.float16: "F16", + torch.bfloat16: "BF16", + torch.float64: "F64", + torch.int8: "I8", + torch.int16: "I16", + torch.int32: "I32", + torch.int64: "I64", + torch.uint8: "U8", + torch.bool: "BOOL", +} + + +def check_dtype_pattern(reference_id, model): + """Flag tensors whose dtype diverges from the reference checkpoint. + + Reads the reference safetensors header via the Hub API (no weight download). Useful to catch cases + like Qwen3.5 where specific params (e.g. linear_attn.A_log) are kept in fp32 while the rest is bf16. + """ + metadata = api.get_safetensors_metadata(reference_id) + ref_dtypes = {name: info.dtype for fm in metadata.files_metadata.values() for name, info in fm.tensors.items()} + + mismatches = [] + for name, tensor in model.state_dict().items(): + ref_dtype = ref_dtypes.get(name) + if ref_dtype is None: + continue # tensor has no counterpart in the reference (e.g. 
scale-down, PEFT wrapper, tying) + tiny_dtype = _TORCH_TO_SAFETENSORS_DTYPE.get(tensor.dtype) + if tiny_dtype != ref_dtype: + mismatches.append((name, ref_dtype, tiny_dtype)) + + if not mismatches: + print(f"[dtype_check] {reference_id}: all matched tensors have the reference dtype") + return + + print(f"[dtype_check] {reference_id}: {len(mismatches)} tensors differ from reference:") + for name, ref, tiny in mismatches: + print(f" {name}: reference={ref}, tiny={tiny}") + + +def print_config_diff(reference_id, model): + """Print the flat, recursive diff between the reference Hub config and the tiny-model config.""" + reference_config = AutoConfig.from_pretrained(reference_id) + ref_flat = _flatten(reference_config.to_dict()) + tiny_flat = _flatten(model.config.to_dict()) + + keys = sorted(set(ref_flat) | set(tiny_flat)) + rows = [] + for k in keys: + if any(k == ig or k.endswith(f".{ig}") for ig in _DIFF_IGNORE): + continue + rv, tv = ref_flat.get(k, ""), tiny_flat.get(k, "") + if rv != tv: + rows.append((k, rv, tv)) + + print(f"[config_diff] {reference_id} vs tiny ({len(rows)} differences)") + for k, r, t in rows: + print(f" {k:48s} {str(r)[:34]:34s} → {str(t)[:34]}") + + +def _parse_args(): + parser = argparse.ArgumentParser(add_help=False) + parser.add_argument( + "--create-pr", + action="store_true", + help="If the repo already exists, open a PR instead of skipping.", + ) + args, _ = parser.parse_known_args() + return args + + +def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, force=False, create_pr=None): + if create_pr is None: + create_pr = _parse_args().create_pr + + model_class_name = model.__class__.__name__ + content = MODEL_CARD.format(model_class_name=model_class_name) + model_card = ModelCard(content) + if prefix is not None: + model_class_name = f"{prefix}-{model_class_name}" + repo_id = f"{ORGANIZATION}/{model_class_name}" + if suffix is not None: + repo_id += f"-{suffix}" + + exists = api.repo_exists(repo_id) + if exists and not force and not create_pr: + print(f"Model {repo_id} already exists, skipping (pass --create-pr to open a PR)") + return + + if not exists: + api.create_repo(repo_id, exist_ok=True) + + # Save all artifacts to a temp dir and upload them in a single commit, so --create-pr opens one PR. + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + if tokenizer is not None: + tokenizer.save_pretrained(tmpdir) + if generation_config is not None: + generation_config.save_pretrained(tmpdir) + model_card.save(os.path.join(tmpdir, "README.md")) + + operations = [ + CommitOperationAdd(path_in_repo=os.path.relpath(os.path.join(root, name), tmpdir), + path_or_fileobj=os.path.join(root, name)) + for root, _, files in os.walk(tmpdir) + for name in files + ] + api.create_commit( + repo_id=repo_id, + operations=operations, + commit_message=f"Upload tiny {model.__class__.__name__}", + create_pr=exists and create_pr, + ) + + +def init_weights_tiny_model(model): + """ + Initialize tiny test models to avoid NaNs from uninitialized weights. 
+ + Uses safe defaults: + - Linear/Conv1d: Xavier uniform (weights), zero (biases) + - Embedding: Normal(0, 0.02) + - LayerNorm: Ones (weights), zero (biases) + """ + for module in model.modules(): + if isinstance(module, nn.Linear): + if module.bias is not None: + nn.init.zeros_(module.bias) + nn.init.xavier_uniform_(module.weight) + + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=0.02) + + elif isinstance(module, nn.LayerNorm): + nn.init.ones_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + + elif isinstance(module, nn.Conv1d): + if module.bias is not None: + nn.init.zeros_(module.bias) + nn.init.xavier_uniform_(module.weight) diff --git a/scripts/generate_tiny_models/for_causal_lm/__init__.py b/scripts/generate_tiny_models/for_causal_lm/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py new file mode 100644 index 00000000000..fe1d72eae89 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, Cohere2Config, Cohere2ForCausalLM, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "CohereLabs/tiny-aya-earth" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Cohere2Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Cohere2ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py new file mode 100644 index 00000000000..48336e33ecc --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, CohereConfig, CohereForCausalLM, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "CohereLabs/aya-expanse-8b" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = CohereConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = CohereForCausalLM(config).to(dtype=torch.float16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py new file mode 100644 index 00000000000..0c29cffbae1 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
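+
+# Note: the R1-0528 variant is generated by the sibling script deepseek_v3_for_causal_lm_0528.py, kept
+# separate because its chat template differs from R1's.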
+ +import torch +from transformers import AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "deepseek-ai/DeepSeek-R1" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = DeepseekV3Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = DeepseekV3ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py new file mode 100644 index 00000000000..a29bd6cec0e --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py @@ -0,0 +1,42 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: R1-0528 is kept in addition to R1 because it has a different chat template. + +import torch +from transformers import AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "deepseek-ai/DeepSeek-R1-0528" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = DeepseekV3Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = DeepseekV3ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "0528") diff --git a/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py new file mode 100644 index 00000000000..502bdd35831 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, FalconMambaConfig, FalconMambaForCausalLM, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "tiiuae/falcon-7b-instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = FalconMambaConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = FalconMambaForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py new file mode 100644 index 00000000000..3d96bd09a9a --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, Gemma2Config, Gemma2ForCausalLM, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "google/gemma-2-2b-it" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Gemma2Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Gemma2ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py new file mode 100644 index 00000000000..b391e48473a --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GemmaConfig, GemmaForCausalLM, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "google/gemma-7b-it" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = GemmaConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = GemmaForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py new file mode 100644 index 00000000000..c07297fd00e --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py @@ -0,0 +1,42 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
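+
+# GLM-4.5 names its expert count n_routed_experts (Qwen3-MoE uses num_experts, GPT-OSS num_local_experts);
+# both MoE knobs below are scaled down (4 experts, 2 active per token) to keep the checkpoint tiny.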
+ +import torch +from transformers import AutoTokenizer, GenerationConfig, Glm4MoeConfig, Glm4MoeForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "zai-org/GLM-4.5" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Glm4MoeConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, + n_routed_experts=4, + num_experts_per_tok=2, +) +model = Glm4MoeForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py new file mode 100644 index 00000000000..758841b0b33 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, GPT2Config, GPT2LMHeadModel + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "openai-community/gpt2" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = GPT2Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = GPT2LMHeadModel(config).to(dtype=torch.float32) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py new file mode 100644 index 00000000000..f2eed955e9f --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, GPTNeoXConfig, GPTNeoXForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "EleutherAI/pythia-14m" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = GPTNeoXConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = GPTNeoXForCausalLM(config).to(dtype=torch.float16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py new file mode 100644 index 00000000000..88b100da305 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py @@ -0,0 +1,42 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
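+
+# GPT-OSS exposes its expert count as num_local_experts; 4 experts with 2 active per token keeps the MoE
+# routing path exercised while the checkpoint stays tiny.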
+ +import torch +from transformers import AutoTokenizer, GenerationConfig, GptOssConfig, GptOssForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "openai/gpt-oss-20b" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = GptOssConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, + num_local_experts=4, + num_experts_per_tok=2, +) +model = GptOssForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py new file mode 100644 index 00000000000..41140f1b431 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = LlamaConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = LlamaForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "3") diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py new file mode 100644 index 00000000000..8ddf1a3a5cb --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = LlamaConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = LlamaForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "3.1") diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py new file mode 100644 index 00000000000..d6396fdc11e --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = LlamaConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = LlamaForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "3.2") diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py new file mode 100644 index 00000000000..33ed69fd2c4 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = MistralConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = MistralForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "0.1") diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py new file mode 100644 index 00000000000..1463c60c9ab --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = MistralConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = MistralForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "0.2") diff --git a/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py new file mode 100644 index 00000000000..dd34ffeb4ca --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, OPTConfig, OPTForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "facebook/opt-1.3b" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = OPTConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = OPTForCausalLM(config).to(dtype=torch.float16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py new file mode 100644 index 00000000000..c1e4b8c261e --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py @@ -0,0 +1,29 @@ +# Copyright 2020-2026 The HuggingFace Team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft import LoraConfig, get_peft_model +from transformers import GenerationConfig, Qwen3ForCausalLM + +from .._common import check_transformers_version, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +BASE = "trl-internal-testing/tiny-Qwen3ForCausalLM" + +model = Qwen3ForCausalLM.from_pretrained(BASE, dtype="auto") +model = get_peft_model(model, LoraConfig()) +generation_config = GenerationConfig.from_pretrained(BASE) +smoke_test(model, None) +push_to_hub(model, None, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py new file mode 100644 index 00000000000..1d0da0a62c3 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py @@ -0,0 +1,31 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Same model class as peft_qwen3_for_causal_lm.py, with different (random) LoRA weights. + +from peft import LoraConfig, get_peft_model +from transformers import GenerationConfig, Qwen3ForCausalLM + +from .._common import check_transformers_version, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +BASE = "trl-internal-testing/tiny-Qwen3ForCausalLM" + +model = Qwen3ForCausalLM.from_pretrained(BASE, dtype="auto") +model = get_peft_model(model, LoraConfig()) +generation_config = GenerationConfig.from_pretrained(BASE) +smoke_test(model, None) +push_to_hub(model, None, generation_config, "tiny", "2") diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py new file mode 100644 index 00000000000..3dbe53eb51b --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "microsoft/Phi-3-mini-4k-instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Phi3Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Phi3ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "3") diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py new file mode 100644 index 00000000000..9685f638b9e --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "microsoft/Phi-3.5-mini-instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Phi3Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Phi3ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "3.5") diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py new file mode 100644 index 00000000000..81ca31f7957 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen2.5-32B-Instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Qwen2Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Qwen2ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "2.5") diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py new file mode 100644 index 00000000000..18af3689ef5 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py @@ -0,0 +1,40 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Qwen2Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Qwen2ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "2.5-Coder") diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py new file mode 100644 index 00000000000..cea498065e7 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py @@ -0,0 +1,42 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3-8B" +# Revision pins the chat template PR with `{% generation %}` support. 
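+# (`{% generation %}` wraps the assistant turns in the chat template so that
+# `apply_chat_template(..., return_assistant_tokens_mask=True)` can identify completion tokens.)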
+REVISION = "refs/pr/14" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=REVISION) +generation_config = GenerationConfig.from_pretrained(MODEL_ID, revision=REVISION) +config = Qwen3Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Qwen3ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py new file mode 100644 index 00000000000..96f7e74059f --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py @@ -0,0 +1,42 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer, GenerationConfig, Qwen3MoeConfig, Qwen3MoeForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3-30B-A3B" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Qwen3MoeConfig( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, + num_experts=4, + num_experts_per_tok=2, +) +model = Qwen3MoeForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py new file mode 100644 index 00000000000..a19f39a4cf4 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py @@ -0,0 +1,41 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Slightly bigger than the "tiny" variant: vLLM requires hidden_size // num_attention_heads = 32. + +import torch +from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen2.5-32B-Instruct" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Qwen2Config( + vocab_size=len(tokenizer.vocab), + hidden_size=128, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Qwen2ForCausalLM(config).to(dtype=torch.bfloat16) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "small", "2.5") diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py new file mode 100644 index 00000000000..2af5ba70df3 --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py @@ -0,0 +1,41 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Slightly bigger than the "tiny" variant: vLLM requires hidden_size // num_attention_heads = 32. + +import torch +from transformers import AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3-4B" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Qwen3Config( + vocab_size=len(tokenizer.vocab), + hidden_size=128, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Qwen3ForCausalLM(config).to(dtype=torch.bfloat16) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "small") diff --git a/scripts/generate_tiny_models/for_conditional_generation/__init__.py b/scripts/generate_tiny_models/for_conditional_generation/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/generate_tiny_models/for_conditional_generation/bart_model.py b/scripts/generate_tiny_models/for_conditional_generation/bart_model.py new file mode 100644 index 00000000000..aa180d5e119 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/bart_model.py @@ -0,0 +1,32 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoConfig, AutoTokenizer, BartModel + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "facebook/bart-base" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +config = AutoConfig.from_pretrained(MODEL_ID) +config.d_model = 24 +model = BartModel(config).to(dtype=torch.float32) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, None, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py new file mode 100644 index 00000000000..7c928061f61 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py @@ -0,0 +1,48 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# python -m scripts.generate_tiny_models.for_conditional_generation.gemma3_for_conditional_generation + +import torch +from transformers import AutoConfig, AutoProcessor, Gemma3ForConditionalGeneration, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "google/gemma-3-4b-it" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, # Set it automatically from num_hidden_layers +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = Gemma3ForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py new file mode 100644 index 00000000000..23d1ddbeed0 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py @@ -0,0 +1,60 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Gemma4 rope validation fails when passing text_config as a dict through AutoConfig, +# so the config is loaded first and then mutated in place. 
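+# Sketch of the two construction paths (the first fails, per the note above; the exact failure
+# point inside rope validation is not shown here):
+#   AutoConfig.from_pretrained(MODEL_ID, text_config={"num_hidden_layers": 2, ...})  # raises
+#   config = AutoConfig.from_pretrained(MODEL_ID)  # works: load the full-size config first,
+#   config.text_config.num_hidden_layers = 2       # then shrink it in place (as done below)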
+ +import torch +from transformers import AutoConfig, AutoProcessor, Gemma4ForConditionalGeneration, GenerationConfig + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "5.6.0" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "google/gemma-4-E2B-it" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained(MODEL_ID) +for k, v in text_config.items(): + setattr(config.text_config, k, v) +for k, v in vision_config.items(): + setattr(config.vision_config, k, v) +config.text_config.layer_types = ["sliding_attention", "full_attention"] +config.text_config.num_kv_shared_layers = 0 +config.text_config.global_head_dim = 8 +config.text_config.hidden_size_per_layer_input = 16 +config.audio_config = None + +model = Gemma4ForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py new file mode 100644 index 00000000000..74b61fac6bc --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py @@ -0,0 +1,53 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, Idefics2ForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "HuggingFaceM4/idefics2-8b" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained( + MODEL_ID, + text_config=text_config, + vision_config=vision_config, + perceiver_config={"hidden_size": 16}, +) +model = Idefics2ForConditionalGeneration(config).to(dtype=torch.float32) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py new file mode 100644 index 00000000000..fd6e71c3cc0 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py @@ -0,0 +1,48 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, Idefics3ForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "HuggingFaceM4/Idefics3-8B-Llama3" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = Idefics3ForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py new file mode 100644 index 00000000000..de9ef6b6448 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py @@ -0,0 +1,48 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, InternVLForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "OpenGVLab/InternVL3-8B-hf" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = InternVLForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py new file mode 100644 index 00000000000..cbc404ed11e --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py @@ -0,0 +1,48 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, LlavaForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "llava-hf/llava-1.5-7b-hf" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = LlavaForConditionalGeneration(config).to(dtype=torch.float16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py new file mode 100644 index 00000000000..f4505b885bb --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py @@ -0,0 +1,54 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Original model dtype is float16, but it triggers CUDA device-side assert on generation (see GH-4741), +# so this tiny model is saved in bfloat16. +# Upstream hotfix: llava-hf/llava-v1.6-mistral-7b-hf mistakenly sets text_config.dtype to "bfloat16" +# (see https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf/discussions/46), which we clear here. 
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, LlavaNextForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, + "dtype": None, # hotfix for upstream text_config.dtype = "bfloat16" +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = LlavaNextForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py new file mode 100644 index 00000000000..9d2f528f033 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py @@ -0,0 +1,48 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, PaliGemmaForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "google/paligemma-3b-pt-224" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = PaliGemmaForConditionalGeneration(config).to(dtype=torch.float32) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py new file mode 100644 index 00000000000..0f6fa35ad80 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py @@ -0,0 +1,62 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: Qwen2.5-VL requires out_hidden_size on the vision config, plus root-level num_hidden_layers/hidden_size/ +# num_attention_heads (distinct from the text_config fields). See GH-4101 and transformers#41020. 
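+# (Why both levels are shrunk below: the vision merger consumes vision_config.out_hidden_size,
+# and -- as far as the linked issues suggest -- some code paths read num_hidden_layers /
+# hidden_size / num_attention_heads from the root config rather than from text_config. This
+# rationale is an inference from GH-4101 and transformers#41020, not documented behavior.)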
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, Qwen2_5_VLForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, + "rope_scaling": {"type": "default", "mrope_section": [1, 1], "rope_type": "default"}, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, + "depth": 2, + "out_hidden_size": 16, +} + +config = AutoConfig.from_pretrained( + MODEL_ID, + text_config=text_config, + vision_config=vision_config, + rope_scaling={"type": "default", "mrope_section": [1, 1], "rope_type": "default"}, + num_hidden_layers=2, + hidden_size=16, + num_attention_heads=4, +) +model = Qwen2_5_VLForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py new file mode 100644 index 00000000000..c524ddaed18 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py @@ -0,0 +1,57 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: two distinct rope_scaling dict objects (root and text_config). See GH-4101 and transformers#41020. 
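+# Sketch of the pitfall avoided below (assumption: config validation mutates rope_scaling in place):
+#   shared = {"type": "default", "mrope_section": [1, 1], "rope_type": "default"}
+#   AutoConfig.from_pretrained(MODEL_ID, rope_scaling=shared, text_config={"rope_scaling": shared})
+# With a single shared dict, the second validation pass would see the first pass's mutations,
+# so two separate dict literals are passed instead.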
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, Qwen2VLForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, + "rope_scaling": {"type": "default", "mrope_section": [1, 1], "rope_type": "default"}, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, + "depth": 2, +} + +config = AutoConfig.from_pretrained( + MODEL_ID, + text_config=text_config, + vision_config=vision_config, + rope_scaling={"type": "default", "mrope_section": [1, 1], "rope_type": "default"}, +) +model = Qwen2VLForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py new file mode 100644 index 00000000000..54bf25d2882 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py @@ -0,0 +1,62 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Notes: +# - Qwen3.5 auto-builds layer_types from num_hidden_layers with default interval 4, so tiny models +# (2 layers) end up all-linear-attention, which breaks dynamic cache. Force one full-attention layer. +# - The vision config expects `depth`/`num_heads` (not `num_hidden_layers`/`num_attention_heads`). +# - Qwen3.5 has no published generation_config on the Hub yet. +# - Qwen3.5 keeps some linear-attn weights in float32; we cast them back after the bfloat16 conversion. 
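+# Worked example for the first note (a sketch of the default construction, not the upstream code):
+#   ["full_attention" if (i + 1) % 4 == 0 else "linear_attention" for i in range(2)]
+#   -> ["linear_attention", "linear_attention"]  # no full-attention layer, so dynamic cache breaks
+# Hence layer_types is set explicitly below, with full_attention_interval=2 kept consistent with it.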
+ +import torch +from transformers import AutoConfig, AutoProcessor, Qwen3_5ForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "5.2.0" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3.5-0.8B" + +processor = AutoProcessor.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": ["linear_attention", "full_attention"], + "full_attention_interval": 2, +} +vision_config = { + "hidden_size": 16, + "depth": 2, + "num_heads": 4, + "intermediate_size": 32, + "out_hidden_size": 16, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = Qwen3_5ForConditionalGeneration(config).to(dtype=torch.bfloat16) + +# Restore float32 for linear-attn weights that the upstream model keeps in fp32. +for layer in model.model.language_model.layers: + if hasattr(layer, "linear_attn"): + layer.linear_attn.A_log.data = layer.linear_attn.A_log.data.float() + layer.linear_attn.norm.weight.data = layer.linear_attn.norm.weight.data.float() + +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, None, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py new file mode 100644 index 00000000000..794d771ef38 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py @@ -0,0 +1,56 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Notes: +# - "layer_types" is intentionally omitted from text_config: qwen3_vl's modeling code checks +# `hasattr(config, "layer_types")` and uses a different path when absent +# (see transformers/models/qwen3_vl/modeling_qwen3_vl.py). +# - mrope_section needs 3 elements (for dim, offset in enumerate((1, 2), start=1): mrope_section[dim]). 
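+# Worked through: `for dim, offset in enumerate((1, 2), start=1)` indexes mrope_section[1] and
+# mrope_section[2], so a 2-element list raises IndexError; a 3-element list such as [2, 2, 2]
+# (used below) does not.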
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, Qwen3VLForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.57.0" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "rope_scaling": {"mrope_interleaved": True, "mrope_section": [2, 2, 2], "rope_type": "default"}, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, + "depth": 2, + "out_hidden_size": 16, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = Qwen3VLForConditionalGeneration(config).to(dtype=torch.bfloat16) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py new file mode 100644 index 00000000000..88433f4f971 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py @@ -0,0 +1,48 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, SmolVLMForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": None, +} +vision_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = SmolVLMForConditionalGeneration(config).to(dtype=torch.float32) +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py new file mode 100644 index 00000000000..4f19ef21f66 --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py @@ -0,0 +1,33 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, T5ForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "google/flan-t5-small" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = AutoConfig.from_pretrained(MODEL_ID) +config.d_model = 24 +model = T5ForConditionalGeneration(config).to(dtype=torch.float32) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_sequence_classification/__init__.py b/scripts/generate_tiny_models/for_sequence_classification/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py b/scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py new file mode 100644 index 00000000000..ede81cdf693 --- /dev/null +++ b/scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py @@ -0,0 +1,41 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GPTNeoXForSequenceClassification + +from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "EleutherAI/pythia-14m" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = AutoConfig.from_pretrained( + MODEL_ID, + num_labels=1, + hidden_size=16, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = GPTNeoXForSequenceClassification(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py b/scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py new file mode 100644 index 00000000000..4dede5dff74 --- /dev/null +++ b/scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py @@ -0,0 +1,41 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
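+
+# Note: num_labels=1 below yields a single-logit classification head, i.e. one scalar score per
+# sequence; the assumption is that downstream reward-model style tests consume that shape (the
+# script itself does not check this).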
+
+import torch
+from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaForSequenceClassification
+
+from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test
+
+TRANSFORMERS_VERSION = "4.56.2"
+check_transformers_version(TRANSFORMERS_VERSION)
+
+MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+generation_config = GenerationConfig.from_pretrained(MODEL_ID)
+config = AutoConfig.from_pretrained(
+    MODEL_ID,
+    num_labels=1,
+    hidden_size=16,
+    num_attention_heads=4,
+    num_key_value_heads=2,
+    num_hidden_layers=2,
+    intermediate_size=32,
+)
+model = LlamaForSequenceClassification(config).to(dtype=torch.bfloat16)
+init_weights_tiny_model(model)
+smoke_test(model, tokenizer)
+check_dtype_pattern(MODEL_ID, model)
+print_config_diff(MODEL_ID, model)
+push_to_hub(model, tokenizer, generation_config, "tiny", "3.2")
diff --git a/scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py b/scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py
new file mode 100644
index 00000000000..33ebaf58ef1
--- /dev/null
+++ b/scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py
@@ -0,0 +1,43 @@
+# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2ForSequenceClassification
+
+from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test
+
+TRANSFORMERS_VERSION = "4.56.2"
+check_transformers_version(TRANSFORMERS_VERSION)
+
+MODEL_ID = "Qwen/Qwen2.5-32B-Instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+generation_config = GenerationConfig.from_pretrained(MODEL_ID)
+config = AutoConfig.from_pretrained(
+    MODEL_ID,
+    num_labels=1,
+    hidden_size=16,
+    num_attention_heads=4,
+    num_key_value_heads=2,
+    num_hidden_layers=2,
+    intermediate_size=32,
+)
+# Work around a transformers bug: layer_types is built from the original num_hidden_layers, ignoring the override above
+config.layer_types = config.layer_types[:2]
+model = Qwen2ForSequenceClassification(config).to(dtype=torch.bfloat16)
+init_weights_tiny_model(model)
+smoke_test(model, tokenizer)
+check_dtype_pattern(MODEL_ID, model)
+print_config_diff(MODEL_ID, model)
+push_to_hub(model, tokenizer, generation_config, "tiny", "2.5")
diff --git a/scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py b/scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py
new file mode 100644
index 00000000000..e48a7296950
--- /dev/null
+++ b/scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py
@@ -0,0 +1,43 @@
+# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3ForSequenceClassification
+
+from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test
+
+TRANSFORMERS_VERSION = "4.56.2"
+check_transformers_version(TRANSFORMERS_VERSION)
+
+MODEL_ID = "Qwen/Qwen3-4B"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+generation_config = GenerationConfig.from_pretrained(MODEL_ID)
+config = AutoConfig.from_pretrained(
+    MODEL_ID,
+    num_labels=1,
+    hidden_size=16,
+    num_attention_heads=4,
+    num_key_value_heads=2,
+    num_hidden_layers=2,
+    intermediate_size=32,
+)
+# Work around a transformers bug: layer_types is built from the original num_hidden_layers, ignoring the override above
+config.layer_types = config.layer_types[:2]
+model = Qwen3ForSequenceClassification(config).to(dtype=torch.bfloat16)
+init_weights_tiny_model(model)
+smoke_test(model, tokenizer)
+check_dtype_pattern(MODEL_ID, model)
+print_config_diff(MODEL_ID, model)
+push_to_hub(model, tokenizer, generation_config, "tiny")
diff --git a/scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py b/scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py
new file mode 100644
index 00000000000..c6f829dc8c7
--- /dev/null
+++ b/scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py
@@ -0,0 +1,42 @@
+# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
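+
+# Sizing note (an inference, not stated in the source): num_experts=4 with num_experts_per_tok=2
+# below is presumably the smallest setup that still exercises top-k expert routing (k < num_experts).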
+ +import torch +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3MoeForSequenceClassification + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3-30B-A3B" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = AutoConfig.from_pretrained( + MODEL_ID, + num_labels=1, + hidden_size=16, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, + num_experts=4, + num_experts_per_tok=2, +) +model = Qwen3MoeForSequenceClassification(config).to(dtype=torch.bfloat16) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny") diff --git a/tests/conftest.py b/tests/conftest.py index f071b789ffd..4008a4d9d0b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,7 @@ MODEL_REVISIONS = { # Add model_id: revision mappings here to test PRs + "trl-internal-testing/tiny-Gemma3ForConditionalGeneration": "refs/pr/7", } From a060e6df799484827b5c0c6bfd463677564ca815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 18:51:07 +0000 Subject: [PATCH 02/20] cohere and fix vocab size --- .../for_causal_lm/cohere2_for_causal_lm.py | 4 ++-- .../for_causal_lm/cohere_for_causal_lm.py | 5 +++-- .../for_causal_lm/deepseek_v3_for_causal_lm.py | 4 ++-- .../for_causal_lm/deepseek_v3_for_causal_lm_0528.py | 4 ++-- .../for_causal_lm/falcon_mamba_for_causal_lm.py | 4 ++-- .../for_causal_lm/gemma2_for_causal_lm.py | 4 ++-- .../for_causal_lm/gemma_for_causal_lm.py | 4 ++-- .../for_causal_lm/glm4_moe_for_causal_lm.py | 4 ++-- .../generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py | 4 ++-- .../for_causal_lm/gpt_neox_for_causal_lm.py | 4 ++-- .../for_causal_lm/gpt_oss_for_causal_lm.py | 4 ++-- .../for_causal_lm/llama_for_causal_lm_3.py | 4 ++-- .../for_causal_lm/llama_for_causal_lm_3_1.py | 4 ++-- .../for_causal_lm/llama_for_causal_lm_3_2.py | 4 ++-- .../for_causal_lm/mistral_for_causal_lm_0_1.py | 4 ++-- .../for_causal_lm/mistral_for_causal_lm_0_2.py | 4 ++-- .../generate_tiny_models/for_causal_lm/opt_for_causal_lm.py | 4 ++-- .../for_causal_lm/phi3_for_causal_lm_3.py | 4 ++-- .../for_causal_lm/phi3_for_causal_lm_3_5.py | 4 ++-- .../for_causal_lm/qwen2_for_causal_lm_2_5.py | 4 ++-- .../for_causal_lm/qwen2_for_causal_lm_2_5_coder.py | 4 ++-- .../for_causal_lm/qwen3_for_causal_lm.py | 4 ++-- .../for_causal_lm/qwen3_moe_for_causal_lm.py | 4 ++-- .../for_causal_lm/small_qwen2_for_causal_lm_2_5.py | 4 ++-- .../for_causal_lm/small_qwen3_for_causal_lm.py | 4 ++-- .../gemma3_for_conditional_generation.py | 2 -- tests/conftest.py | 1 + 27 files changed, 52 insertions(+), 52 deletions(-) diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py index fe1d72eae89..82e11dc84a8 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer, Cohere2Config, Cohere2ForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, Cohere2Config, Cohere2ForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Cohere2Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py index 48336e33ecc..c083d083ba5 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, CohereConfig, CohereForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, CohereConfig, CohereForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,12 +25,13 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = CohereConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, num_hidden_layers=2, intermediate_size=32, + logit_scale=0.125, ) model = CohereForCausalLM(config).to(dtype=torch.float16) init_weights_tiny_model(model) diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py index 0c29cffbae1..fa4274fb70a 100644 --- a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = DeepseekV3Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py index a29bd6cec0e..bd852cfe16d 100644 --- a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py @@ -15,7 +15,7 @@ # Note: R1-0528 is kept in addition to R1 because it has a different chat template. 
import torch -from transformers import AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -27,7 +27,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = DeepseekV3Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py index 502bdd35831..a588f131bd2 100644 --- a/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, FalconMambaConfig, FalconMambaForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, FalconMambaConfig, FalconMambaForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = FalconMambaConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py index 3d96bd09a9a..4e5f41aff9f 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, Gemma2Config, Gemma2ForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, Gemma2Config, Gemma2ForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Gemma2Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py index b391e48473a..2c3944eea67 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer, GemmaConfig, GemmaForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, GemmaConfig, GemmaForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GemmaConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py index c07297fd00e..658c0ff9bf9 100644 --- a/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, Glm4MoeConfig, Glm4MoeForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Glm4MoeConfig, Glm4MoeForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Glm4MoeConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py index 758841b0b33..44fb9dbad35 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, GPT2Config, GPT2LMHeadModel +from transformers import AutoConfig, AutoTokenizer, GPT2Config, GPT2LMHeadModel, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GPT2Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py index f2eed955e9f..608f377669b 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer, GenerationConfig, GPTNeoXConfig, GPTNeoXForCausalLM +from transformers import AutoConfig, AutoTokenizer, GPTNeoXConfig, GPTNeoXForCausalLM, GenerationConfig from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GPTNeoXConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py index 88b100da305..599c14e7f56 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, GptOssConfig, GptOssForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GptOssConfig, GptOssForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GptOssConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py index 41140f1b431..e24e0958804 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = LlamaConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py index 8ddf1a3a5cb..071c72b7cc0 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = LlamaConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py index d6396fdc11e..7691e69d318 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = LlamaConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py index 33ed69fd2c4..1c7bf17a370 100644 --- a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = MistralConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py index 1463c60c9ab..638b86df193 100644 --- a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = MistralConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py index dd34ffeb4ca..a8b64038f28 100644 --- a/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, OPTConfig, OPTForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, OPTConfig, OPTForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = OPTConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py index 3dbe53eb51b..982e802e09f 100644 --- a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Phi3Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py index 9685f638b9e..23e367b8040 100644 --- a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Phi3Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py index 81ca31f7957..84f168180ce 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen2Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py index 18af3689ef5..6ada52fea5d 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen2Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py index cea498065e7..ebb45674ac2 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -27,7 +27,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=REVISION) generation_config = GenerationConfig.from_pretrained(MODEL_ID, revision=REVISION) config = Qwen3Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py index 96f7e74059f..3e42172015f 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoTokenizer, GenerationConfig, Qwen3MoeConfig, Qwen3MoeForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3MoeConfig, Qwen3MoeForCausalLM from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test @@ -25,7 +25,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen3MoeConfig( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py index a19f39a4cf4..225c4be0587 100644 --- a/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py @@ -15,7 +15,7 @@ # Slightly bigger than the "tiny" variant: vLLM requires hidden_size // num_attention_heads = 32. import torch -from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test @@ -27,7 +27,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen2Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=128, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py index 2af5ba70df3..599c857d64b 100644 --- a/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py @@ -15,7 +15,7 @@ # Slightly bigger than the "tiny" variant: vLLM requires hidden_size // num_attention_heads = 32. 
import torch -from transformers import AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test @@ -27,7 +27,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen3Config( - vocab_size=len(tokenizer.vocab), + vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, hidden_size=128, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py index 7c928061f61..050865d27ff 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# python -m scripts.generate_tiny_models.for_conditional_generation.gemma3_for_conditional_generation - import torch from transformers import AutoConfig, AutoProcessor, Gemma3ForConditionalGeneration, GenerationConfig diff --git a/tests/conftest.py b/tests/conftest.py index 4008a4d9d0b..921ae910cda 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,7 @@ MODEL_REVISIONS = { # Add model_id: revision mappings here to test PRs + "trl-internal-testing/tiny-CohereForCausalLM": "refs/pr/1", "trl-internal-testing/tiny-Gemma3ForConditionalGeneration": "refs/pr/7", } From 158b891a198f667b9396838da7c76bbf285b19e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 18:51:14 +0000 Subject: [PATCH 03/20] print pr --- scripts/generate_tiny_models/_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/generate_tiny_models/_common.py b/scripts/generate_tiny_models/_common.py index 1f713853f59..96ef12b617b 100644 --- a/scripts/generate_tiny_models/_common.py +++ b/scripts/generate_tiny_models/_common.py @@ -216,12 +216,14 @@ def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, f for root, _, files in os.walk(tmpdir) for name in files ] - api.create_commit( + commit_info = api.create_commit( repo_id=repo_id, operations=operations, commit_message=f"Upload tiny {model.__class__.__name__}", create_pr=exists and create_pr, ) + if commit_info.pr_url: + print(f"[push_to_hub] PR opened: {commit_info.pr_url}") def init_weights_tiny_model(model): From f5eedfb6815e1a2655b0963426a348a125b07e15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 18:55:13 +0000 Subject: [PATCH 04/20] precommit --- scripts/generate_tiny_models/__init__.py | 14 ++++++++++++++ .../generate_tiny_models/for_causal_lm/__init__.py | 14 ++++++++++++++ .../for_conditional_generation/__init__.py | 14 ++++++++++++++ .../for_sequence_classification/__init__.py | 14 ++++++++++++++ 4 files changed, 56 insertions(+) diff --git a/scripts/generate_tiny_models/__init__.py b/scripts/generate_tiny_models/__init__.py index e69de29bb2d..3d26f4482fe 100644 --- a/scripts/generate_tiny_models/__init__.py +++ b/scripts/generate_tiny_models/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/scripts/generate_tiny_models/for_causal_lm/__init__.py b/scripts/generate_tiny_models/for_causal_lm/__init__.py index e69de29bb2d..3d26f4482fe 100644 --- a/scripts/generate_tiny_models/for_causal_lm/__init__.py +++ b/scripts/generate_tiny_models/for_causal_lm/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/scripts/generate_tiny_models/for_conditional_generation/__init__.py b/scripts/generate_tiny_models/for_conditional_generation/__init__.py index e69de29bb2d..3d26f4482fe 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/__init__.py +++ b/scripts/generate_tiny_models/for_conditional_generation/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/scripts/generate_tiny_models/for_sequence_classification/__init__.py b/scripts/generate_tiny_models/for_sequence_classification/__init__.py index e69de29bb2d..3d26f4482fe 100644 --- a/scripts/generate_tiny_models/for_sequence_classification/__init__.py +++ b/scripts/generate_tiny_models/for_sequence_classification/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ From ffbf3b1730e4a742b4f3f4791e3a0507dca21ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 18:58:47 +0000 Subject: [PATCH 05/20] precommit --- scripts/generate_tiny_models/_common.py | 15 +++++++++++---- .../for_causal_lm/cohere2_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/cohere_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/deepseek_v3_for_causal_lm.py | 10 +++++++++- .../deepseek_v3_for_causal_lm_0528.py | 10 +++++++++- .../for_causal_lm/falcon_mamba_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/gemma2_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/gemma_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/glm4_moe_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/gpt2_lm_head_model.py | 12 ++++++++++-- .../for_causal_lm/gpt_neox_for_causal_lm.py | 12 ++++++++++-- .../for_causal_lm/gpt_oss_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/llama_for_causal_lm_3.py | 10 +++++++++- .../for_causal_lm/llama_for_causal_lm_3_1.py | 10 +++++++++- .../for_causal_lm/llama_for_causal_lm_3_2.py | 10 +++++++++- .../for_causal_lm/mistral_for_causal_lm_0_1.py | 10 +++++++++- .../for_causal_lm/mistral_for_causal_lm_0_2.py | 10 +++++++++- .../for_causal_lm/opt_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/peft_qwen3_for_causal_lm.py | 1 + .../for_causal_lm/peft_qwen3_for_causal_lm_2.py | 1 + .../for_causal_lm/phi3_for_causal_lm_3.py | 10 +++++++++- .../for_causal_lm/phi3_for_causal_lm_3_5.py | 10 +++++++++- .../for_causal_lm/qwen2_for_causal_lm_2_5.py | 10 +++++++++- .../qwen2_for_causal_lm_2_5_coder.py | 10 +++++++++- .../for_causal_lm/qwen3_for_causal_lm.py | 10 +++++++++- .../for_causal_lm/qwen3_moe_for_causal_lm.py | 10 +++++++++- .../small_qwen2_for_causal_lm_2_5.py | 1 + .../for_causal_lm/small_qwen3_for_causal_lm.py | 1 + .../for_conditional_generation/bart_model.py | 1 + .../gemma3_for_conditional_generation.py | 1 + .../gemma4_for_conditional_generation.py | 1 + .../idefics2_for_conditional_generation.py | 1 + .../idefics3_for_conditional_generation.py | 1 + .../internvl_for_conditional_generation.py | 1 + .../llava_for_conditional_generation.py | 1 + .../llava_next_for_conditional_generation.py | 1 + .../paligemma_for_conditional_generation.py | 1 + .../qwen2_5_vl_for_conditional_generation.py | 1 + .../qwen2_vl_for_conditional_generation.py | 1 + .../qwen3_5_for_conditional_generation.py | 1 + .../qwen3_vl_for_conditional_generation.py | 1 + .../smolvlm_for_conditional_generation.py | 1 + .../t5_for_conditional_generation.py | 1 + .../gpt_neox_for_sequence_classification.py | 10 +++++++++- .../llama_for_sequence_classification_3_2.py | 10 +++++++++- .../qwen2_for_sequence_classification_2_5.py | 10 +++++++++- .../qwen3_for_sequence_classification.py | 10 +++++++++- .../qwen3_moe_for_sequence_classification.py | 1 + 48 files changed, 276 insertions(+), 33 deletions(-) diff --git a/scripts/generate_tiny_models/_common.py b/scripts/generate_tiny_models/_common.py index 96ef12b617b..900a8ebe6f4 100644 --- a/scripts/generate_tiny_models/_common.py +++ b/scripts/generate_tiny_models/_common.py @@ -50,7 +50,7 @@ def check_transformers_version(expected_version): if Version(transformers.__version__) != Version(expected_version): raise RuntimeError( - f"This script requires transformers=={expected_version}, " f"but {transformers.__version__} is installed." + f"This script requires transformers=={expected_version}, but {transformers.__version__} is installed." 
) @@ -67,7 +67,12 @@ def smoke_test(model, tokenizer_or_processor=None): red = Image.new("RGB", (24, 24), color="red") blue = Image.new("RGB", (24, 24), color="blue") messages = [ - [{"role": "user", "content": [{"type": "image", "image": red}, {"type": "text", "text": "What is this?"}]}], + [ + { + "role": "user", + "content": [{"type": "image", "image": red}, {"type": "text", "text": "What is this?"}], + } + ], [{"role": "user", "content": [{"type": "text", "text": "Is it blue?"}, {"type": "image", "image": blue}]}], ] inputs = processor.apply_chat_template( @@ -211,8 +216,10 @@ def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, f model_card.save(os.path.join(tmpdir, "README.md")) operations = [ - CommitOperationAdd(path_in_repo=os.path.relpath(os.path.join(root, name), tmpdir), - path_or_fileobj=os.path.join(root, name)) + CommitOperationAdd( + path_in_repo=os.path.relpath(os.path.join(root, name), tmpdir), + path_or_fileobj=os.path.join(root, name), + ) for root, _, files in os.walk(tmpdir) for name in files ] diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py index 82e11dc84a8..f1508fe8b57 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, Cohere2Config, Cohere2ForCausalLM, GenerationConfig -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py index c083d083ba5..6731ff52838 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, CohereConfig, CohereForCausalLM, GenerationConfig -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py index fa4274fb70a..d480a516606 100644 --- a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + 
TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py index bd852cfe16d..9fdb50c2d90 100644 --- a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py @@ -17,7 +17,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py index a588f131bd2..f93f1653d5a 100644 --- a/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, FalconMambaConfig, FalconMambaForCausalLM, GenerationConfig -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py index 4e5f41aff9f..ce38bcc8ae1 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, Gemma2Config, Gemma2ForCausalLM, GenerationConfig -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py index 2c3944eea67..bdd85a572cc 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GemmaConfig, GemmaForCausalLM, GenerationConfig -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff 
--git a/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py index 658c0ff9bf9..c96fbc1b89e 100644 --- a/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Glm4MoeConfig, Glm4MoeForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py index 44fb9dbad35..a91c36aae73 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py @@ -13,9 +13,17 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GPT2Config, GPT2LMHeadModel, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GPT2Config, GPT2LMHeadModel + +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py index 608f377669b..18bc7d12956 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py @@ -13,9 +13,17 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, GPTNeoXConfig, GPTNeoXForCausalLM, GenerationConfig +from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GPTNeoXConfig, GPTNeoXForCausalLM + +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py index 599c14e7f56..270adc826e5 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GptOssConfig, GptOssForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py index e24e0958804..f3808e8d992 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py index 071c72b7cc0..ed9e607c9f7 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py index 7691e69d318..e4285e892b3 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py @@ -15,7 +15,15 @@ import torch from 
transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py index 1c7bf17a370..061c5d27cfa 100644 --- a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py index 638b86df193..6c477fd0922 100644 --- a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py index a8b64038f28..817223bad05 100644 --- a/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, OPTConfig, OPTForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py index c1e4b8c261e..7f647facaf8 100644 --- a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py @@ -17,6 +17,7 @@ from .._common import check_transformers_version, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" 
check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py index 1d0da0a62c3..08fb10ce0a9 100644 --- a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py @@ -19,6 +19,7 @@ from .._common import check_transformers_version, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py index 982e802e09f..87ab1a30db2 100644 --- a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py index 23e367b8040..0b6ce00bbdc 100644 --- a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py index 84f168180ce..0198bb05ed8 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py index 6ada52fea5d..908fc0692b1 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py @@ -15,7 +15,15 @@ import torch from transformers 
import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py index ebb45674ac2..84ff6176136 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py index 3e42172015f..0f24e3175c6 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3MoeConfig, Qwen3MoeForCausalLM -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py index 225c4be0587..5556958abce 100644 --- a/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py @@ -19,6 +19,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py index 599c857d64b..2b3840ff535 100644 --- a/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py @@ -19,6 +19,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/bart_model.py b/scripts/generate_tiny_models/for_conditional_generation/bart_model.py index aa180d5e119..c754515af72 100644 --- 
a/scripts/generate_tiny_models/for_conditional_generation/bart_model.py +++ b/scripts/generate_tiny_models/for_conditional_generation/bart_model.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py index 050865d27ff..1cc3f2f361d 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py index 23d1ddbeed0..8d3cba21904 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py @@ -20,6 +20,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "5.6.0" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py index 74b61fac6bc..4e6dd711aff 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/idefics2_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py index fd6e71c3cc0..a24909ab7d8 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/idefics3_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py index de9ef6b6448..76ec81ca4ea 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/internvl_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, 
smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py index cbc404ed11e..1a7ecf4b8cb 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/llava_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py index f4505b885bb..875fe0545da 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/llava_next_for_conditional_generation.py @@ -22,6 +22,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py index 9d2f528f033..5b5fa67ec9d 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/paligemma_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py index 0f6fa35ad80..2eab8826124 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen2_5_vl_for_conditional_generation.py @@ -20,6 +20,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py index c524ddaed18..b3fdf792418 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen2_vl_for_conditional_generation.py @@ -19,6 +19,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py 
b/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py index 54bf25d2882..2caa177b742 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py @@ -24,6 +24,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "5.2.0" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py index 794d771ef38..f04fb4cfadf 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen3_vl_for_conditional_generation.py @@ -23,6 +23,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.57.0" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py index 88433f4f971..b019e6d084b 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/smolvlm_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py index 4f19ef21f66..451e7b5ecc9 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/t5_for_conditional_generation.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py b/scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py index ede81cdf693..c0e270ad050 100644 --- a/scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py +++ b/scripts/generate_tiny_models/for_sequence_classification/gpt_neox_for_sequence_classification.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GPTNeoXForSequenceClassification -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git 
a/scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py b/scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py index 4dede5dff74..e082ed94656 100644 --- a/scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py +++ b/scripts/generate_tiny_models/for_sequence_classification/llama_for_sequence_classification_3_2.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaForSequenceClassification -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py b/scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py index 33ebaf58ef1..6b83cf4204f 100644 --- a/scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py +++ b/scripts/generate_tiny_models/for_sequence_classification/qwen2_for_sequence_classification_2_5.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2ForSequenceClassification -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py b/scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py index e48a7296950..fa05dcc1105 100644 --- a/scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py +++ b/scripts/generate_tiny_models/for_sequence_classification/qwen3_for_sequence_classification.py @@ -15,7 +15,15 @@ import torch from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3ForSequenceClassification -from .._common import check_dtype_pattern, check_transformers_version, init_weights_tiny_model, print_config_diff, push_to_hub, smoke_test +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + TRANSFORMERS_VERSION = "4.56.2" check_transformers_version(TRANSFORMERS_VERSION) diff --git a/scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py b/scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py index c6f829dc8c7..b89842afbaa 100644 --- a/scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py +++ b/scripts/generate_tiny_models/for_sequence_classification/qwen3_moe_for_sequence_classification.py @@ -17,6 +17,7 @@ from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + TRANSFORMERS_VERSION = "4.56.2" 
check_transformers_version(TRANSFORMERS_VERSION) From d24a76c87001864fe13e1c3da17c1af82ae6f5f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 19:12:45 +0000 Subject: [PATCH 06/20] cohere2 --- .../for_causal_lm/cohere2_for_causal_lm.py | 15 +++++++++++++++ tests/conftest.py | 1 + 2 files changed, 16 insertions(+) diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py index f1508fe8b57..0b3c779546a 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py @@ -39,6 +39,21 @@ num_key_value_heads=2, num_hidden_layers=2, intermediate_size=32, + bos_token_id=2, + eos_token_id=3, + logit_scale=1.0, + max_position_embeddings=500000, + rope_theta=50000, + cache_implementation="hybrid", + layer_switch=4, + order_of_interleaved_layers="local_attn_first", + position_embedding_type="rope_gptj", + rotary_pct=1.0, + use_embedding_sharing=True, + use_gated_activation=True, + use_parallel_block=True, + use_parallel_embedding=False, + use_qk_norm=False, ) model = Cohere2ForCausalLM(config).to(dtype=torch.bfloat16) init_weights_tiny_model(model) diff --git a/tests/conftest.py b/tests/conftest.py index 921ae910cda..402c14f635b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,6 +38,7 @@ MODEL_REVISIONS = { # Add model_id: revision mappings here to test PRs "trl-internal-testing/tiny-CohereForCausalLM": "refs/pr/1", + "trl-internal-testing/tiny-Cohere2ForCausalLM": "refs/pr/1", "trl-internal-testing/tiny-Gemma3ForConditionalGeneration": "refs/pr/7", } From f0f5563e5ca61adc98a7eb5332c0f5c99c261334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 19:58:29 +0000 Subject: [PATCH 07/20] deepseek v3 --- scripts/generate_tiny_models/_common.py | 2 + .../deepseek_v3_for_causal_lm.py | 64 +++++++++++++++++-- tests/conftest.py | 1 + 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/scripts/generate_tiny_models/_common.py b/scripts/generate_tiny_models/_common.py index 900a8ebe6f4..914df2bd9e4 100644 --- a/scripts/generate_tiny_models/_common.py +++ b/scripts/generate_tiny_models/_common.py @@ -118,6 +118,8 @@ def _flatten(d, prefix=""): torch.float16: "F16", torch.bfloat16: "BF16", torch.float64: "F64", + torch.float8_e4m3fn: "F8_E4M3", + torch.float8_e5m2: "F8_E5M2", torch.int8: "I8", torch.int16: "I16", torch.int32: "I32", diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py index d480a516606..6e5b3e1a55a 100644 --- a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py @@ -12,8 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import tempfile + import torch -from transformers import AutoConfig, AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig +from transformers import ( + AutoConfig, + AutoTokenizer, + DeepseekV3Config, + DeepseekV3ForCausalLM, + FineGrainedFP8Config, + GenerationConfig, +) from .._common import ( check_dtype_pattern, @@ -32,16 +41,61 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +# DeepSeek-R1 uses weight_block_size=[128,128] upstream. 
We use [32,32] for the tiny model so that smaller +# hidden dims still tile cleanly (every projection dim divisible by 32, ≥ 2 blocks per dim to avoid +# a scalar weight_scale_inv shape). Trade-off: drops out of the DeepGEMM fast path onto Triton; fine +# for a tiny model used in tests. config = DeepseekV3Config( vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, - hidden_size=8, + hidden_size=64, num_attention_heads=4, num_key_value_heads=2, num_hidden_layers=2, - intermediate_size=32, + intermediate_size=64, + max_position_embeddings=163840, + rope_scaling={ + "beta_fast": 32.0, + "beta_slow": 1.0, + "factor": 40.0, + "mscale": 1.0, + "mscale_all_dim": 1.0, + "original_max_position_embeddings": 4096, + "rope_type": "yarn", + "type": "yarn", + }, + ep_size=1, + moe_layer_freq=1, + num_nextn_predict_layers=1, + scoring_func="sigmoid", + topk_method="noaux_tc", ) -model = DeepseekV3ForCausalLM(config).to(dtype=torch.bfloat16) -init_weights_tiny_model(model) + +# Build a random bf16 model, then round-trip through disk with `quantization_config=FineGrainedFP8Config(...)` +# so transformers' FP8 quantizer rewrites Linear layers to FP8Linear and writes FP8 weights + scales. +# Needs a GPU with compute capability >= 8.9 (Ada Lovelace or newer, e.g. RTX 4090, L4, H100); on older hardware the quantizer auto-dequantizes to bf16. +with tempfile.TemporaryDirectory() as tmpdir: + bf16_model = DeepseekV3ForCausalLM(config).to(dtype=torch.bfloat16, device="cuda") + init_weights_tiny_model(bf16_model) + bf16_model.save_pretrained(tmpdir) + tokenizer.save_pretrained(tmpdir) + del bf16_model + torch.cuda.empty_cache() + + quantization_config = FineGrainedFP8Config(activation_scheme="dynamic", weight_block_size=[32, 32]) + model = DeepseekV3ForCausalLM.from_pretrained( + tmpdir, + quantization_config=quantization_config, + dtype=torch.bfloat16, + device_map="cuda", + ) + +# `dtype=torch.bfloat16` casts the whole model, including the FP32 per-block scales the quantizer +# created. Restore them to FP32 to match the reference's dtype pattern.
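+# Hedged aside, illustration only (these lines would not ship with the tiny model):
+# the tiling arithmetic behind the weight_block_size=[32, 32] comment above, worked
+# for one assumed 64x64 projection. Only the block size is taken from this script;
+# the 64x64 shape is a stand-in for the hidden/intermediate-sized weights.
+_out_features, _in_features, _block = 64, 64, 32
+assert _out_features % _block == 0 and _in_features % _block == 0  # every dim tiles cleanly
+_scale_shape = (_out_features // _block, _in_features // _block)
+assert _scale_shape == (2, 2)  # >= 2 blocks per dim, so weight_scale_inv stays 2-D, never scalar
+# (The loop below performs the FP32 restore described in the comment above.)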
+for module in model.modules(): + if hasattr(module, "weight_scale_inv") and module.weight_scale_inv is not None: + module.weight_scale_inv.data = module.weight_scale_inv.data.float() + smoke_test(model, tokenizer) check_dtype_pattern(MODEL_ID, model) print_config_diff(MODEL_ID, model) diff --git a/tests/conftest.py b/tests/conftest.py index 402c14f635b..10a602d55d7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,6 +39,7 @@ # Add model_id: revision mappings here to test PRs "trl-internal-testing/tiny-CohereForCausalLM": "refs/pr/1", "trl-internal-testing/tiny-Cohere2ForCausalLM": "refs/pr/1", + "trl-internal-testing/tiny-DeepseekV3ForCausalLM": "refs/pr/2", "trl-internal-testing/tiny-Gemma3ForConditionalGeneration": "refs/pr/7", } From 59cb16e8724f523d1c86b137d54e2847faefc024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 20:21:00 +0000 Subject: [PATCH 08/20] revert to keep this focused --- .../for_causal_lm/cohere2_for_causal_lm.py | 19 +----- .../for_causal_lm/cohere_for_causal_lm.py | 4 +- .../deepseek_v3_for_causal_lm.py | 66 ++----------------- .../deepseek_v3_for_causal_lm_0528.py | 4 +- .../falcon_mamba_for_causal_lm.py | 4 +- .../for_causal_lm/gemma2_for_causal_lm.py | 4 +- .../for_causal_lm/gemma_for_causal_lm.py | 4 +- .../for_causal_lm/glm4_moe_for_causal_lm.py | 4 +- .../for_causal_lm/gpt2_lm_head_model.py | 4 +- .../for_causal_lm/gpt_neox_for_causal_lm.py | 4 +- .../for_causal_lm/gpt_oss_for_causal_lm.py | 4 +- .../for_causal_lm/llama_for_causal_lm_3.py | 4 +- .../for_causal_lm/llama_for_causal_lm_3_1.py | 4 +- .../for_causal_lm/llama_for_causal_lm_3_2.py | 4 +- .../mistral_for_causal_lm_0_1.py | 4 +- .../mistral_for_causal_lm_0_2.py | 4 +- .../for_causal_lm/opt_for_causal_lm.py | 4 +- .../for_causal_lm/phi3_for_causal_lm_3.py | 4 +- .../for_causal_lm/phi3_for_causal_lm_3_5.py | 4 +- .../for_causal_lm/qwen2_for_causal_lm_2_5.py | 4 +- .../qwen2_for_causal_lm_2_5_coder.py | 4 +- .../for_causal_lm/qwen3_for_causal_lm.py | 4 +- .../for_causal_lm/qwen3_moe_for_causal_lm.py | 4 +- .../small_qwen2_for_causal_lm_2_5.py | 4 +- .../small_qwen3_for_causal_lm.py | 4 +- .../gemma3_for_conditional_generation.py | 2 + tests/conftest.py | 4 -- 27 files changed, 56 insertions(+), 127 deletions(-) diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py index 0b3c779546a..6632bbb13b0 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere2_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, Cohere2Config, Cohere2ForCausalLM, GenerationConfig +from transformers import AutoTokenizer, Cohere2Config, Cohere2ForCausalLM, GenerationConfig from .._common import ( check_dtype_pattern, @@ -33,27 +33,12 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Cohere2Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, num_hidden_layers=2, intermediate_size=32, - bos_token_id=2, - eos_token_id=3, - logit_scale=1.0, - max_position_embeddings=500000, - rope_theta=50000, - cache_implementation="hybrid", - layer_switch=4, - order_of_interleaved_layers="local_attn_first", - position_embedding_type="rope_gptj", - rotary_pct=1.0, - use_embedding_sharing=True, - use_gated_activation=True, - use_parallel_block=True, - use_parallel_embedding=False, - use_qk_norm=False, ) model = Cohere2ForCausalLM(config).to(dtype=torch.bfloat16) init_weights_tiny_model(model) diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py index 6731ff52838..e561ed1474f 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, CohereConfig, CohereForCausalLM, GenerationConfig +from transformers import AutoTokenizer, CohereConfig, CohereForCausalLM, GenerationConfig from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = CohereConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py index 6e5b3e1a55a..fe13290ecfa 100644 --- a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm.py @@ -12,17 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tempfile - import torch -from transformers import ( - AutoConfig, - AutoTokenizer, - DeepseekV3Config, - DeepseekV3ForCausalLM, - FineGrainedFP8Config, - GenerationConfig, -) +from transformers import AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig from .._common import ( check_dtype_pattern, @@ -41,61 +32,16 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) - -# DeepSeek-R1 uses weight_block_size=[128,128] upstream. We use [32,32] for the tiny so that smaller -# hidden dims still tile cleanly (every projection dim divisible by 32, ≥ 2 blocks per dim to avoid -# a scalar weight_scale_inv shape). Trade-off: drops out of the DeepGEMM fast path onto Triton; fine -# for a tiny used in tests. 
config = DeepseekV3Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, - hidden_size=64, + vocab_size=len(tokenizer.vocab), + hidden_size=8, num_attention_heads=4, num_key_value_heads=2, num_hidden_layers=2, - intermediate_size=64, - max_position_embeddings=163840, - rope_scaling={ - "beta_fast": 32.0, - "beta_slow": 1.0, - "factor": 40.0, - "mscale": 1.0, - "mscale_all_dim": 1.0, - "original_max_position_embeddings": 4096, - "rope_type": "yarn", - "type": "yarn", - }, - ep_size=1, - moe_layer_freq=1, - num_nextn_predict_layers=1, - scoring_func="sigmoid", - topk_method="noaux_tc", + intermediate_size=32, ) - -# Build a random bf16 model, then round-trip through disk with `quantization_config=FineGrainedFP8Config(...)` -# so transformers' FP8 quantizer rewrites Linear layers to FP8Linear and writes FP8 weights + scales. -# Needs a GPU with compute capability >= 8.9 (H100+); on older hardware the quantizer auto-dequantizes to bf16. -with tempfile.TemporaryDirectory() as tmpdir: - bf16_model = DeepseekV3ForCausalLM(config).to(dtype=torch.bfloat16, device="cuda") - init_weights_tiny_model(bf16_model) - bf16_model.save_pretrained(tmpdir) - tokenizer.save_pretrained(tmpdir) - del bf16_model - torch.cuda.empty_cache() - - quantization_config = FineGrainedFP8Config(activation_scheme="dynamic", weight_block_size=[32, 32]) - model = DeepseekV3ForCausalLM.from_pretrained( - tmpdir, - quantization_config=quantization_config, - dtype=torch.bfloat16, - device_map="cuda", - ) - -# `dtype=torch.bfloat16` casts the whole model, including the FP32 per-block scales the quantizer -# created. Restore them to FP32 to match the reference's dtype pattern. -for module in model.modules(): - if hasattr(module, "weight_scale_inv") and module.weight_scale_inv is not None: - module.weight_scale_inv.data = module.weight_scale_inv.data.float() - +model = DeepseekV3ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) smoke_test(model, tokenizer) check_dtype_pattern(MODEL_ID, model) print_config_diff(MODEL_ID, model) diff --git a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py index 9fdb50c2d90..13db90d36f7 100644 --- a/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py +++ b/scripts/generate_tiny_models/for_causal_lm/deepseek_v3_for_causal_lm_0528.py @@ -15,7 +15,7 @@ # Note: R1-0528 is kept in addition to R1 because it has a different chat template. import torch -from transformers import AutoConfig, AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig +from transformers import AutoTokenizer, DeepseekV3Config, DeepseekV3ForCausalLM, GenerationConfig from .._common import ( check_dtype_pattern, @@ -35,7 +35,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = DeepseekV3Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py index f93f1653d5a..77133708ac1 100644 --- a/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/falcon_mamba_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, FalconMambaConfig, FalconMambaForCausalLM, GenerationConfig +from transformers import AutoTokenizer, FalconMambaConfig, FalconMambaForCausalLM, GenerationConfig from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = FalconMambaConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py index ce38bcc8ae1..68935533aac 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gemma2_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, Gemma2Config, Gemma2ForCausalLM, GenerationConfig +from transformers import AutoTokenizer, Gemma2Config, Gemma2ForCausalLM, GenerationConfig from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Gemma2Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py index bdd85a572cc..22874adc2f9 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gemma_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GemmaConfig, GemmaForCausalLM, GenerationConfig +from transformers import AutoTokenizer, GemmaConfig, GemmaForCausalLM, GenerationConfig from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GemmaConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py index c96fbc1b89e..b0721795295 100644 --- a/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/glm4_moe_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Glm4MoeConfig, Glm4MoeForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Glm4MoeConfig, Glm4MoeForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Glm4MoeConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py index a91c36aae73..8d1eb5103ea 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt2_lm_head_model.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GPT2Config, GPT2LMHeadModel +from transformers import AutoTokenizer, GenerationConfig, GPT2Config, GPT2LMHeadModel from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GPT2Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py index 18bc7d12956..080076f18c8 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_neox_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GPTNeoXConfig, GPTNeoXForCausalLM +from transformers import AutoTokenizer, GenerationConfig, GPTNeoXConfig, GPTNeoXForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GPTNeoXConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py index 270adc826e5..6ae18272af7 100644 --- a/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/gpt_oss_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, GptOssConfig, GptOssForCausalLM +from transformers import AutoTokenizer, GenerationConfig, GptOssConfig, GptOssForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = GptOssConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py index f3808e8d992..1622c372870 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM +from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = LlamaConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py index ed9e607c9f7..cb361901fcf 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_1.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM +from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = LlamaConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py index e4285e892b3..34fda29b5f9 100644 --- a/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/llama_for_causal_lm_3_2.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM +from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = LlamaConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py index 061c5d27cfa..34615475bf5 100644 --- a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_1.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM +from transformers import AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = MistralConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py index 6c477fd0922..aa4a9ce849a 100644 --- a/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/mistral_for_causal_lm_0_2.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM +from transformers import AutoTokenizer, GenerationConfig, MistralConfig, MistralForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = MistralConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py index 817223bad05..cf52a1c7c93 100644 --- a/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/opt_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, OPTConfig, OPTForCausalLM +from transformers import AutoTokenizer, GenerationConfig, OPTConfig, OPTForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = OPTConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py index 87ab1a30db2..edb13a7634b 100644 --- a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Phi3Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py index 0b6ce00bbdc..d5816214c0b 100644 --- a/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/phi3_for_causal_lm_3_5.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Phi3Config, Phi3ForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Phi3Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py index 0198bb05ed8..4b94615fcaf 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen2Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py index 908fc0692b1..b7cce4494ba 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen2_for_causal_lm_2_5_coder.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen2Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py index 84ff6176136..00afd997aad 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM from .._common import ( check_dtype_pattern, @@ -35,7 +35,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=REVISION) generation_config = GenerationConfig.from_pretrained(MODEL_ID, revision=REVISION) config = Qwen3Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py index 0f24e3175c6..ccf562f3c46 100644 --- a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3MoeConfig, Qwen3MoeForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Qwen3MoeConfig, Qwen3MoeForCausalLM from .._common import ( check_dtype_pattern, @@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen3MoeConfig( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=8, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py index 5556958abce..acb0ef4f465 100644 --- a/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen2_for_causal_lm_2_5.py @@ -15,7 +15,7 @@ # Slightly bigger than the "tiny" variant: vLLM requires hidden_size // num_attention_heads = 32. import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Qwen2Config, Qwen2ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test @@ -28,7 +28,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen2Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=128, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py index 2b3840ff535..873fcb9641f 100644 --- a/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/small_qwen3_for_causal_lm.py @@ -15,7 +15,7 @@ # Slightly bigger than the "tiny" variant: vLLM requires hidden_size // num_attention_heads = 32. 
import torch -from transformers import AutoConfig, AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM +from transformers import AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test @@ -28,7 +28,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) generation_config = GenerationConfig.from_pretrained(MODEL_ID) config = Qwen3Config( - vocab_size=AutoConfig.from_pretrained(MODEL_ID).vocab_size, + vocab_size=len(tokenizer.vocab), hidden_size=128, num_attention_heads=4, num_key_value_heads=2, diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py index 1cc3f2f361d..2f85c68abe8 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/gemma3_for_conditional_generation.py @@ -37,6 +37,8 @@ "num_hidden_layers": 2, "hidden_size": 16, "num_attention_heads": 4, + "num_key_value_heads": 2, + "embed_dim": 64, } config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) diff --git a/tests/conftest.py b/tests/conftest.py index 10a602d55d7..f071b789ffd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,10 +37,6 @@ MODEL_REVISIONS = { # Add model_id: revision mappings here to test PRs - "trl-internal-testing/tiny-CohereForCausalLM": "refs/pr/1", - "trl-internal-testing/tiny-Cohere2ForCausalLM": "refs/pr/1", - "trl-internal-testing/tiny-DeepseekV3ForCausalLM": "refs/pr/2", - "trl-internal-testing/tiny-Gemma3ForConditionalGeneration": "refs/pr/7", } From 9bc6ad4f6234cc103a09c618a81c77c9ddf2c09f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 20:26:52 +0000 Subject: [PATCH 09/20] nit --- scripts/generate_tiny_models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_tiny_models/README.md b/scripts/generate_tiny_models/README.md index 14d96a793c7..35a2b8fa9fb 100644 --- a/scripts/generate_tiny_models/README.md +++ b/scripts/generate_tiny_models/README.md @@ -6,7 +6,7 @@ This directory contains one script per tiny model used by the TRL test suite. Ea ``` generate_tiny_models/ -├── _common.py # shared helpers (push_to_hub, smoke_test, print_config_diff, ...) +├── _common.py # shared helpers (push_to_hub, smoke_test, print_config_diff, ...) 
├── for_causal_lm/ # *ForCausalLM + GPT-2 LM head + small/PEFT variants ├── for_sequence_classification/ # *ForSequenceClassification (reward models) └── for_conditional_generation/ # *ForConditionalGeneration (VLMs + T5 + Bart encoder-decoder) From a7ad64a13f88c9229ec9818154df5a436d8371fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3%A9dec?= Date: Fri, 24 Apr 2026 20:28:15 +0000 Subject: [PATCH 10/20] revert --- .../generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py index e561ed1474f..daafb1ec187 100644 --- a/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/cohere_for_causal_lm.py @@ -39,7 +39,6 @@ num_key_value_heads=2, num_hidden_layers=2, intermediate_size=32, - logit_scale=0.125, ) model = CohereForCausalLM(config).to(dtype=torch.float16) init_weights_tiny_model(model) From 6b361e1c61d22fd302d8af27eac80705a7f2998c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 20:29:23 +0000 Subject: [PATCH 11/20] remove force and update readme --- scripts/generate_tiny_models/README.md | 13 ++++++++++--- scripts/generate_tiny_models/_common.py | 4 ++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/scripts/generate_tiny_models/README.md b/scripts/generate_tiny_models/README.md index 35a2b8fa9fb..644039837ac 100644 --- a/scripts/generate_tiny_models/README.md +++ b/scripts/generate_tiny_models/README.md @@ -25,10 +25,17 @@ Each script: 1. Checks that the installed `transformers` version matches the one pinned in the script (fails otherwise). 2. Builds the tiny model with random weights. 3. Runs `smoke_test` — a minimal forward pass to catch config misspecification and NaNs. -4. Runs `print_config_diff` — prints every flat-key difference between the reference Hub config and the tiny model's config (for debugging scale-downs). -5. Pushes the model, tokenizer/processor, generation config, and model card to the Hub. +4. Runs `check_dtype_pattern` — reads the reference safetensors header via the Hub API and flags any tensor whose dtype diverges from the reference (catches e.g. fp32 norms kept inside a bf16 checkpoint). +5. Runs `print_config_diff` — prints every flat-key difference between the reference Hub config and the tiny model's config (for debugging scale-downs). +6. Pushes the model, tokenizer/processor, generation config, and model card to the Hub in a single commit. -If the repo already exists on the Hub, the push is skipped (pass `force=True` in `push_to_hub(...)` to overwrite). +If the repo already exists on the Hub, the push is skipped by default. Pass `--create-pr` to open a PR against the existing repo instead: + +```bash +python -m scripts.generate_tiny_models.for_causal_lm.qwen3_for_causal_lm --create-pr +``` + +Direct overwrites of `main` aren't supported — update via `--create-pr` and merge the PR on the Hub.
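+For intuition, the dtype check in step 4 can be approximated with `huggingface_hub`'s safetensors metadata API, which reads checkpoint headers over HTTP without downloading the weights. A minimal sketch, assuming `get_safetensors_metadata` (the real helper in `_common.py` may differ):
+
+```python
+# Rough sketch only, not the actual check_dtype_pattern implementation.
+from huggingface_hub import get_safetensors_metadata
+
+
+def reference_dtypes(repo_id: str) -> dict[str, str]:
+    # Map each tensor name in the reference checkpoint to its dtype string
+    # (e.g. "BF16", "F32") by parsing the safetensors headers remotely.
+    metadata = get_safetensors_metadata(repo_id)
+    return {
+        name: info.dtype
+        for file_meta in metadata.files_metadata.values()
+        for name, info in file_meta.tensors.items()
+    }
+```
+
+A tiny model's `named_parameters()` dtypes can then be compared against these per-tensor patterns to spot divergences.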
## Version pinning diff --git a/scripts/generate_tiny_models/_common.py b/scripts/generate_tiny_models/_common.py index 914df2bd9e4..2d72fc84109 100644 --- a/scripts/generate_tiny_models/_common.py +++ b/scripts/generate_tiny_models/_common.py @@ -187,7 +187,7 @@ def _parse_args(): return args -def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, force=False, create_pr=None): +def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, create_pr=None): if create_pr is None: create_pr = _parse_args().create_pr @@ -201,7 +201,7 @@ def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, f repo_id += f"-{suffix}" exists = api.repo_exists(repo_id) - if exists and not force and not create_pr: + if exists and not create_pr: print(f"Model {repo_id} already exists, skipping (pass --create-pr to open a PR)") return From b2cf6034cc5ef3a1dc7d658fa6031a32483f0afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 20:30:52 +0000 Subject: [PATCH 12/20] nit commit message --- scripts/generate_tiny_models/_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_tiny_models/_common.py b/scripts/generate_tiny_models/_common.py index 2d72fc84109..45f95311e36 100644 --- a/scripts/generate_tiny_models/_common.py +++ b/scripts/generate_tiny_models/_common.py @@ -228,7 +228,7 @@ def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, c commit_info = api.create_commit( repo_id=repo_id, operations=operations, - commit_message=f"Upload tiny {model.__class__.__name__}", + commit_message=f"Upload {model.__class__.__name__}", create_pr=exists and create_pr, ) if commit_info.pr_url: From b4bae788923612d4a8547b9e664cfa3175df46d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 20:33:46 +0000 Subject: [PATCH 13/20] better --- scripts/generate_tiny_models/_common.py | 17 +++++++++-------- .../qwen3_5_for_conditional_generation.py | 9 +++++---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/scripts/generate_tiny_models/_common.py b/scripts/generate_tiny_models/_common.py index 45f95311e36..e9b7c5c7acc 100644 --- a/scripts/generate_tiny_models/_common.py +++ b/scripts/generate_tiny_models/_common.py @@ -24,7 +24,7 @@ from huggingface_hub import CommitOperationAdd, HfApi, ModelCard from packaging.version import Version from torch import nn -from transformers import AutoConfig +from transformers import AutoConfig, ProcessorMixin ORGANIZATION = "trl-internal-testing" @@ -59,7 +59,7 @@ def smoke_test(model, tokenizer_or_processor=None): model.eval() device = next(model.parameters()).device - if tokenizer_or_processor is not None and hasattr(tokenizer_or_processor, "image_processor"): + if isinstance(tokenizer_or_processor, ProcessorMixin): # VLM path: build a dummy (image, text) input via the processor. 
from PIL import Image @@ -89,14 +89,15 @@ def smoke_test(model, tokenizer_or_processor=None): with torch.no_grad(): out = model(**inputs) - logits = getattr(out, "logits", None) - if logits is None: - logits = getattr(out, "last_hidden_state", None) - if logits is None: + if "logits" in out: + output_tensor = out["logits"] + elif "last_hidden_state" in out: + output_tensor = out["last_hidden_state"] + else: raise RuntimeError(f"[smoke_test] {model.__class__.__name__}: no logits or last_hidden_state on output") - if torch.isnan(logits).any(): + if torch.isnan(output_tensor).any(): raise RuntimeError(f"[smoke_test] {model.__class__.__name__}: NaN in forward output") - print(f"[smoke_test] {model.__class__.__name__}: OK (output shape {tuple(logits.shape)})") + print(f"[smoke_test] {model.__class__.__name__}: OK (output shape {tuple(output_tensor.shape)})") def _flatten(d, prefix=""): diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py index 2caa177b742..ce0d0dfd0e7 100644 --- a/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen3_5_for_conditional_generation.py @@ -52,10 +52,11 @@ model = Qwen3_5ForConditionalGeneration(config).to(dtype=torch.bfloat16) # Restore float32 for linear-attn weights that the upstream model keeps in fp32. -for layer in model.model.language_model.layers: - if hasattr(layer, "linear_attn"): - layer.linear_attn.A_log.data = layer.linear_attn.A_log.data.float() - layer.linear_attn.norm.weight.data = layer.linear_attn.norm.weight.data.float() +for i, layer_type in enumerate(config.text_config.layer_types): + if layer_type == "linear_attention": + linear_attn = model.model.language_model.layers[i].linear_attn + linear_attn.A_log.data = linear_attn.A_log.data.float() + linear_attn.norm.weight.data = linear_attn.norm.weight.data.float() smoke_test(model, processor) check_dtype_pattern(MODEL_ID, model) From 0b7fa20434c7a61c1c6e7071338548991471374e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 24 Apr 2026 20:52:41 +0000 Subject: [PATCH 14/20] fix generation config peft --- .../for_causal_lm/peft_qwen3_for_causal_lm.py | 5 ++--- .../for_causal_lm/peft_qwen3_for_causal_lm_2.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py index 7f647facaf8..7688639f873 100644 --- a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py +++ b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
from peft import LoraConfig, get_peft_model -from transformers import GenerationConfig, Qwen3ForCausalLM +from transformers import Qwen3ForCausalLM from .._common import check_transformers_version, push_to_hub, smoke_test @@ -25,6 +25,5 @@ model = Qwen3ForCausalLM.from_pretrained(BASE, dtype="auto") model = get_peft_model(model, LoraConfig()) -generation_config = GenerationConfig.from_pretrained(BASE) smoke_test(model, None) -push_to_hub(model, None, generation_config, "tiny") +push_to_hub(model, None, None, "tiny") diff --git a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py index 08fb10ce0a9..cf84cfd7dee 100644 --- a/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py +++ b/scripts/generate_tiny_models/for_causal_lm/peft_qwen3_for_causal_lm_2.py @@ -15,7 +15,7 @@ # Same model class as peft_qwen3_for_causal_lm.py, with different (random) LoRA weights. from peft import LoraConfig, get_peft_model -from transformers import GenerationConfig, Qwen3ForCausalLM +from transformers import Qwen3ForCausalLM from .._common import check_transformers_version, push_to_hub, smoke_test @@ -27,6 +27,5 @@ model = Qwen3ForCausalLM.from_pretrained(BASE, dtype="auto") model = get_peft_model(model, LoraConfig()) -generation_config = GenerationConfig.from_pretrained(BASE) smoke_test(model, None) -push_to_hub(model, None, generation_config, "tiny", "2") +push_to_hub(model, None, None, "tiny", "2") From 39bafd43446d9bc051600948a396e9a1f3059d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Sun, 26 Apr 2026 11:16:48 -0400 Subject: [PATCH 15/20] Qwen3.6 integration (#5642) --- docs/source/chat_templates.md | 6 +- docs/source/grpo_trainer.md | 1 + tests/test_chat_template_utils.py | 18 +++ tests/test_data_utils.py | 7 + tests/test_dpo_trainer.py | 7 + tests/test_grpo_trainer.py | 7 + tests/test_rloo_trainer.py | 7 + tests/test_sft_trainer.py | 7 + trl/chat_template_utils.py | 17 ++- trl/chat_templates/README.md | 8 ++ trl/chat_templates/qwen3_6.jinja | 154 ++++++++++++++++++++ trl/chat_templates/qwen3_6_training.jinja | 162 ++++++++++++++++++++++ 12 files changed, 397 insertions(+), 4 deletions(-) create mode 100644 trl/chat_templates/qwen3_6.jinja create mode 100644 trl/chat_templates/qwen3_6_training.jinja diff --git a/docs/source/chat_templates.md b/docs/source/chat_templates.md index 94475e9a29c..964578298df 100644 --- a/docs/source/chat_templates.md +++ b/docs/source/chat_templates.md @@ -20,7 +20,7 @@ TRL ships patched templates under [`trl/chat_templates/`](https://github.com/hug ## Supported model families -TRL stores reference copies of the original templates so it can identify supported models at init and swap in a training template when needed. The following families are recognized: DeepSeek-V3, Gemma, GLM-4-MoE, GPT-OSS, Llama 3 / 3.1 / 3.2, Qwen2.5, Qwen3, Qwen3-VL, Qwen3.5. +TRL stores reference copies of the original templates so it can identify supported models at init and swap in a training template when needed. The following families are recognized: DeepSeek-V3, Gemma, GLM-4-MoE, GPT-OSS, Llama 3 / 3.1 / 3.2, Qwen2.5, Qwen3, Qwen3-VL, Qwen3.5, Qwen3.6. ## Training templates @@ -98,6 +98,10 @@ Patched Qwen2.5 template. 
Diff vs `qwen2_5.jinja`: Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` so that `return_assistant_tokens_mask=True` produces correct masks for SFT assistant-only loss. +### `qwen3_6_training.jinja` + +Patched Qwen3.6 template. Diff vs `qwen3_6.jinja`: same set of changes as `qwen3_training.jinja` — require both `<think>` and `</think>` to be present before parsing, drop the `loop.index0 > ns.last_query_index` conditional so the thinking block is always emitted (prefix-preservation), and wrap assistant output with `{% generation %}` / `{% endgeneration %}` markers for SFT assistant-only loss. + ## Related utilities See [Chat Template Utilities](chat_template_utils) for the helper functions ([`clone_chat_template`], [`is_chat_template_prefix_preserving`], [`get_training_chat_template`]) that operate on these templates. diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index a4c9108c703..98133d1a441 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -751,6 +751,7 @@ Tested with: - [**Qwen3**](https://huggingface.co/collections/Qwen/qwen3) — e.g., `Qwen/Qwen3-0.6B` - [**Qwen3-VL**](https://huggingface.co/collections/Qwen/qwen3-vl) — e.g., `Qwen/Qwen3-VL-2B-Instruct` - [**Qwen3.5**](https://huggingface.co/collections/Qwen/qwen35) — e.g., `Qwen/Qwen3.5-2B` +- [**Qwen3.6**](https://huggingface.co/collections/Qwen/qwen36) — e.g., `Qwen/Qwen3.6-35B-A3B` > [!TIP] > Compatibility with all LLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes. diff --git a/tests/test_chat_template_utils.py b/tests/test_chat_template_utils.py index d813caec447..9ce97297a3b 100644 --- a/tests/test_chat_template_utils.py +++ b/tests/test_chat_template_utils.py @@ -152,6 +152,7 @@ def test_add_response_schema(self, tokenizer_name): [ pytest.param("trl-internal-testing/tiny-Qwen3VLForConditionalGeneration", id="qwen3_vl"), pytest.param("trl-internal-testing/tiny-Qwen3_5ForConditionalGeneration", id="qwen35"), + pytest.param("trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", id="qwen36"), ], ) def test_add_response_schema_vlm(self, processor_name): @@ -222,6 +223,14 @@ class TestSupportsToolCalling: reason="Qwen3.5 tokenizer requires transformers>=5.0.0", ), ), + pytest.param( + "trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", + id="qwen36", + marks=pytest.mark.skipif( + Version(transformers.__version__) < Version("5.0.0"), + reason="Qwen3.5 tokenizer requires transformers>=5.0.0", + ), + ), ], ) def test_supports_tool_calling(self, model_id): @@ -444,6 +453,14 @@ def test_prefix_preserving_template_processor(self): pytest.param("trl-internal-testing/tiny-Phi3ForCausalLM-3", id="phi3"), pytest.param("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", id="qwen2.5"), pytest.param("trl-internal-testing/tiny-Qwen3MoeForCausalLM", id="qwen3"), + pytest.param( + "trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", + id="qwen36", + marks=pytest.mark.skipif( + Version(transformers.__version__) < Version("5.0.0"), + reason="Qwen3.5 tokenizer requires transformers>=5.0.0", + ), + ), ], ) class TestGetTrainingChatTemplate: @@ -648,6 +665,7 @@ def test_assistant_masks_multi_turn(self, tokenizer_name): pytest.param("trl-internal-testing/tiny-Qwen3MoeForCausalLM", id="qwen3"), pytest.param("trl-internal-testing/tiny-Qwen3VLForConditionalGeneration", id="qwen3_vl"),
pytest.param("trl-internal-testing/tiny-Qwen3_5ForConditionalGeneration", id="qwen35"), + pytest.param("trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", id="qwen36"), pytest.param( "trl-internal-testing/tiny-Gemma4ForConditionalGeneration", id="gemma4", diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py index 8c77ab37e00..3498a8f1850 100644 --- a/tests/test_data_utils.py +++ b/tests/test_data_utils.py @@ -553,6 +553,13 @@ class TestApplyChatTemplate(TrlTestCase): reason="Qwen3.5 tokenizer requires transformers>=5.0.0", ), ), + pytest.param( + "trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", + marks=pytest.mark.skipif( + Version(transformers.__version__) < Version("5.0.0"), + reason="Qwen3.5 tokenizer requires transformers>=5.0.0", + ), + ), ] conversational_examples = [ diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index ddff0f29cdd..5000c0f449a 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -1051,6 +1051,13 @@ def test_tag_added_peft(self): reason="Qwen3.5 models were introduced in transformers-5.2.0", ), ), + pytest.param( + "trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", + marks=pytest.mark.skipif( + Version(transformers.__version__) < Version("5.2.0"), + reason="Qwen3.5 models were introduced in transformers-5.2.0", + ), + ), ], ) @require_vision diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index bbb861eec14..8f3e14660d8 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -1939,6 +1939,13 @@ def test_prepare_input_called_with_correct_data(self): reason="Qwen3.5 models were introduced in transformers-5.2.0", ), ), + pytest.param( + "trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", + marks=pytest.mark.skipif( + Version(transformers.__version__) < Version("5.2.0"), + reason="Qwen3.5 models were introduced in transformers-5.2.0", + ), + ), # "trl-internal-testing/tiny-SmolVLMForConditionalGeneration", seems not to support bf16 properly ], ) diff --git a/tests/test_rloo_trainer.py b/tests/test_rloo_trainer.py index 39f766ed5a9..1bea5a80d7d 100644 --- a/tests/test_rloo_trainer.py +++ b/tests/test_rloo_trainer.py @@ -1331,6 +1331,13 @@ def test_prepare_input_called_with_correct_data(self): reason="Qwen3.5 models were introduced in transformers-5.2.0", ), ), + pytest.param( + "trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", + marks=pytest.mark.skipif( + Version(transformers.__version__) < Version("5.2.0"), + reason="Qwen3.5 models were introduced in transformers-5.2.0", + ), + ), # "trl-internal-testing/tiny-SmolVLMForConditionalGeneration", seems not to support bf16 properly ], ) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index 95e06238ce5..2c10d2b7e94 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1660,6 +1660,13 @@ def test_tag_added_peft(self): reason="Qwen3.5 models were introduced in transformers-5.2.0", ), ), + pytest.param( + "trl-internal-testing/tiny-Qwen3_5MoeForConditionalGeneration-3.6", + marks=pytest.mark.skipif( + Version(transformers.__version__) < Version("5.2.0"), + reason="Qwen3.5 models were introduced in transformers-5.2.0", + ), + ), ], ) @require_vision diff --git a/trl/chat_template_utils.py b/trl/chat_template_utils.py index b754b5b0316..fccb0a578e3 100644 --- a/trl/chat_template_utils.py +++ b/trl/chat_template_utils.py @@ -332,6 +332,8 @@ def clone_chat_template( qwen3_5_chat_template_4b_and_above = 
(_CHAT_TEMPLATES_DIR / "qwen3_5_4b_and_above.jinja").read_text() +qwen3_6_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3_6.jinja").read_text() + ProcessingClassT = TypeVar("ProcessingClassT", PreTrainedTokenizerBase, ProcessorMixin) @@ -384,7 +386,11 @@ def add_response_schema(processing_class: ProcessingClassT) -> ProcessingClassT: tokenizer.response_schema = llama3_schema elif chat_template in [qwen3_chat_template, qwen3_vl_chat_template]: tokenizer.response_schema = qwen3_schema - elif chat_template in [qwen3_5_chat_template_2b_and_below, qwen3_5_chat_template_4b_and_above]: + elif chat_template in [ + qwen3_5_chat_template_2b_and_below, + qwen3_5_chat_template_4b_and_above, + qwen3_6_chat_template, + ]: tokenizer.response_schema = qwen3_5_schema else: raise ValueError( @@ -539,6 +545,8 @@ def is_chat_template_prefix_preserving(processing_class: PreTrainedTokenizerBase qwen3_training_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3_training.jinja").read_text() +qwen3_6_training_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3_6_training.jinja").read_text() + def get_training_chat_template(tokenizer: PreTrainedTokenizerBase) -> str | None: r""" @@ -546,8 +554,8 @@ def get_training_chat_template(tokenizer: PreTrainedTokenizerBase) -> str | None Returns a patched chat template that is prefix-preserving and includes `{%% generation %%}` / `{%% endgeneration %%}` markers for assistant-only loss masking. Returns `None` if the tokenizer's template already satisfies both - requirements. Currently DeepSeek-V3, Gemma, Gemma2, GLM-4-MoE, GPT-OSS, LLaMA 3, Phi-3, Qwen2.5, and Qwen3 are - supported. + requirements. Currently DeepSeek-V3, Gemma, Gemma2, GLM-4-MoE, GPT-OSS, LLaMA 3, Phi-3, Qwen2.5, Qwen3, and Qwen3.6 + are supported. Args: tokenizer (`PreTrainedTokenizerBase`): @@ -622,6 +630,9 @@ def get_training_chat_template(tokenizer: PreTrainedTokenizerBase) -> str | None if tokenizer.chat_template == qwen3_chat_template: return qwen3_training_chat_template + if tokenizer.chat_template == qwen3_6_chat_template: + return qwen3_6_training_chat_template + raise ValueError( "The tokenizer's chat template is not training-compatible (missing prefix-preservation or " "`{% generation %}` markers) and patching is not supported for this template. " diff --git a/trl/chat_templates/README.md b/trl/chat_templates/README.md index 2c07893c07d..9b9f0243081 100644 --- a/trl/chat_templates/README.md +++ b/trl/chat_templates/README.md @@ -53,6 +53,10 @@ Original Qwen3-VL chat template. Unlike text-only Qwen3, this template is alread Original Qwen3.5 chat templates. +### `qwen3_6.jinja` + +Original Qwen3.6 chat template (shared across `Qwen3.6-27B`, `Qwen3.6-35B-A3B`, and their FP8 variants). Differs from `qwen3_5_4b_and_above.jinja` by adding a `preserve_thinking` flag and tweaking how non-string tool-call argument values are stringified. + ## Training templates Patched templates that fix training-specific issues. Swapped in at init when tools are enabled (GRPO) or when `assistant_only_loss=True` (SFT). @@ -135,3 +139,7 @@ Always include the thinking block regardless of message position. The original c ``` Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` so that `return_assistant_tokens_mask=True` produces correct masks for SFT assistant-only loss. + +### `qwen3_6_training.jinja` + +Patched Qwen3.6 template. 
Same diff as `qwen3_training.jinja` (require both `<think>` and `</think>` before parsing, drop the `loop.index0 > ns.last_query_index` conditional so the thinking block is always emitted, wrap assistant output in `{% generation %}` / `{% endgeneration %}`), applied to the Qwen3.6 base template. diff --git a/trl/chat_templates/qwen3_6.jinja b/trl/chat_templates/qwen3_6.jinja new file mode 100644 index 00000000000..a8755d827c0 --- /dev/null +++ b/trl/chat_templates/qwen3_6.jinja @@ -0,0 +1,154 @@ +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- if not messages %} + {{- raise_exception('No messages provided.') }} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if
ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if not(content.startswith('') and content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {{- raise_exception('No user query found in messages.') }} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if message.role == "system" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first %} + {%- if content|trim %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} + {%- else %} + {{- '\n\n\n' }} + {%- endif %} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '\n' }} + {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/trl/chat_templates/qwen3_6_training.jinja b/trl/chat_templates/qwen3_6_training.jinja new file mode 100644 index 00000000000..e4e705768d7 --- /dev/null +++ b/trl/chat_templates/qwen3_6_training.jinja @@ -0,0 +1,162 @@ +{#- Training variant of the Qwen3.6 chat template (see qwen3_6.jinja for the original). + Modifications vs the original: + - {%- if '' in content %} → {%- if '' in content and '' in content %} + Always check for both tags to avoid edge cases where the model generates only one tag. 
+ - Removed the loop.index0 > ns.last_query_index conditional; always include thinking block. + This makes the template prefix-preserving for the [user, assistant] → [user, assistant, tool] transition. + - Added {% generation %} / {% endgeneration %} around assistant message output to support + assistant-only loss masking in SFT training. +-#} +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- if not messages %} + {{- raise_exception('No messages provided.') }} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if 
not(content.startswith('') and content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {{- raise_exception('No user query found in messages.') }} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if message.role == "system" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content and '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {{- '<|im_start|>' + message.role + '\n' }} + {%- generation %} + {{- '\n' + reasoning_content + '\n\n\n' + content }} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first %} + {%- if content|trim %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} + {%- else %} + {{- '\n\n\n' }} + {%- endif %} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '\n' }} + {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- endgeneration %} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file From 07e65d736e7c8a9f90cd68040cd04eca8b3bb045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Sun, 26 Apr 2026 11:28:01 -0400 Subject: [PATCH 16/20] Release: v1.3 (#5647) --- .github/workflows/tests_latest.yml | 2 +- CITATION.cff | 2 +- VERSION | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests_latest.yml b/.github/workflows/tests_latest.yml index 54debaba39d..94265905b40 100644 --- a/.github/workflows/tests_latest.yml +++ b/.github/workflows/tests_latest.yml @@ -26,7 +26,7 @@ jobs: steps: - name: Git checkout uses: actions/checkout@v6 - with: { ref: v1.2-release } + with: { ref: v1.3-release } - name: Set up Python 3.12 uses: actions/setup-python@v6 diff --git a/CITATION.cff b/CITATION.cff index 
c78b65d38fd..619482508c4 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -37,5 +37,5 @@ keywords: - language model alignment - post-training license: Apache-2.0 -version: '1.2' +version: '1.3' date-released: '2020-03-27' diff --git a/VERSION b/VERSION index 14c65ab0d00..589268e6fed 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.3.0.dev0 \ No newline at end of file +1.3.0 \ No newline at end of file From 7198c14b6e3ceabecf1af74053b3c9b10051c639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Sun, 26 Apr 2026 11:29:30 -0400 Subject: [PATCH 17/20] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Bump=20dev=20version?= =?UTF-8?q?=20(#5648)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 589268e6fed..b58da95673d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.3.0 \ No newline at end of file +1.4.0.dev0 \ No newline at end of file From 71b82192d5192a6d860b4552c8ac17a3c21dec23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 27 Apr 2026 16:21:55 +0000 Subject: [PATCH 18/20] Add Qwen3.6 model generation script with updated configuration --- .../qwen3_6_for_conditional_generation.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 scripts/generate_tiny_models/for_conditional_generation/qwen3_6_for_conditional_generation.py diff --git a/scripts/generate_tiny_models/for_conditional_generation/qwen3_6_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/qwen3_6_for_conditional_generation.py new file mode 100644 index 00000000000..a0f5bef6e7a --- /dev/null +++ b/scripts/generate_tiny_models/for_conditional_generation/qwen3_6_for_conditional_generation.py @@ -0,0 +1,63 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Notes: +# - Qwen3.6 reuses the Qwen3_5Moe class with extra MoE config fields +# (num_experts, num_experts_per_tok, moe_intermediate_size, shared_expert_intermediate_size). +# - Same layer_types/full_attention_interval workaround as Qwen3.5: tiny models (2 layers) need +# one full-attention layer to keep the dynamic cache happy. +# - The vision config expects `depth`/`num_heads` (not `num_hidden_layers`/`num_attention_heads`). +# - Unlike Qwen3.5, Qwen3.6 stores linear-attn weights in bf16, so no float32 cast is needed. 
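+# - Rough intuition for the MoE fields: with num_experts=4 and num_experts_per_tok=2, the router
+#   picks 2 of the 4 tiny expert MLPs per token, alongside the shared expert, so the routing and
+#   top-k dispatch code paths are exercised while the model stays tiny.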
+ +import torch +from transformers import AutoConfig, AutoProcessor, GenerationConfig, Qwen3_5MoeForConditionalGeneration + +from .._common import check_dtype_pattern, check_transformers_version, print_config_diff, push_to_hub, smoke_test + + +TRANSFORMERS_VERSION = "5.2.0" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3.6-35B-A3B" + +processor = AutoProcessor.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) + +text_config = { + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "layer_types": ["linear_attention", "full_attention"], + "full_attention_interval": 2, + "num_experts": 4, + "num_experts_per_tok": 2, + "moe_intermediate_size": 32, + "shared_expert_intermediate_size": 32, +} +vision_config = { + "hidden_size": 16, + "depth": 2, + "num_heads": 4, + "intermediate_size": 32, + "out_hidden_size": 16, +} + +config = AutoConfig.from_pretrained(MODEL_ID, text_config=text_config, vision_config=vision_config) +model = Qwen3_5MoeForConditionalGeneration(config).to(dtype=torch.bfloat16) + +smoke_test(model, processor) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, processor, generation_config, "tiny", "3.6") From 545e5e95231ce5cad304463f9de956d5947974be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 27 Apr 2026 16:24:32 +0000 Subject: [PATCH 19/20] merge main From 4730fecada9be0370f9e271b93ca1cc09cc4885d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 28 Apr 2026 00:14:39 +0000 Subject: [PATCH 20/20] Qwen3 Instruct-2507 --- .../qwen3_for_causal_lm_instruct_2507.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm_instruct_2507.py diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm_instruct_2507.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm_instruct_2507.py new file mode 100644 index 00000000000..6f84c69005b --- /dev/null +++ b/scripts/generate_tiny_models/for_causal_lm/qwen3_for_causal_lm_instruct_2507.py @@ -0,0 +1,50 @@ +# Copyright 2020-2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Qwen3-4B-Instruct-2507 ships the non-thinking chat template, distinct from the default Qwen3 template. 
+ +import torch +from transformers import AutoTokenizer, GenerationConfig, Qwen3Config, Qwen3ForCausalLM + +from .._common import ( + check_dtype_pattern, + check_transformers_version, + init_weights_tiny_model, + print_config_diff, + push_to_hub, + smoke_test, +) + + +TRANSFORMERS_VERSION = "4.56.2" +check_transformers_version(TRANSFORMERS_VERSION) + +MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507" + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +generation_config = GenerationConfig.from_pretrained(MODEL_ID) +config = Qwen3Config( + vocab_size=len(tokenizer.vocab), + hidden_size=8, + num_attention_heads=4, + num_key_value_heads=2, + num_hidden_layers=2, + intermediate_size=32, +) +model = Qwen3ForCausalLM(config).to(dtype=torch.bfloat16) +init_weights_tiny_model(model) +smoke_test(model, tokenizer) +check_dtype_pattern(MODEL_ID, model) +print_config_diff(MODEL_ID, model) +push_to_hub(model, tokenizer, generation_config, "tiny", "Instruct-2507")
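Following the invocation pattern documented in the README, the new script would be run as a module; per the `"tiny"` / `"Instruct-2507"` prefix/suffix arguments to `push_to_hub`, it should target a repo named something like `trl-internal-testing/tiny-Qwen3ForCausalLM-Instruct-2507` (inferred from the naming pattern, not confirmed here):

```bash
# First run creates the tiny repo on the Hub; reruns skip the push
# unless --create-pr is passed to open a PR against the existing repo.
python -m scripts.generate_tiny_models.for_causal_lm.qwen3_for_causal_lm_instruct_2507 --create-pr
```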