54 commits
3c0d9ae
fix(distillation): reverse-KL server path NaN on variable completion …
k1064190 Apr 19, 2026
ea1cc3c
style(distillation tests): slim docstrings to match TRL convention
k1064190 Apr 19, 2026
d3f6a18
test(distillation): guard end-to-end tests against vacuous log-history
k1064190 Apr 19, 2026
5d3d085
test(distillation): parametrize end-to-end test, drop vacuous JSD case
k1064190 Apr 19, 2026
88826fd
Update AsyncGRPO example with GSM8K and tested hyperparameters (#5580)
sergiopaniego Apr 20, 2026
badeb47
Merge branch 'main' into fix/distillation-server-nan-on-variable-comp…
k1064190 Apr 20, 2026
1d9b612
[docs] Add chat templates page to web docs (#5581)
sergiopaniego Apr 20, 2026
9502575
Add additional model parameters to `TestSupportsToolCalling` for impr…
qgallouedec Apr 20, 2026
06244b0
Fix CI with dev dependencies for Llava models (#5499)
albertvillanova Apr 20, 2026
4a2dc7c
Differentiate Phi-3 and Phi-3.5 in tests (#5546)
qgallouedec Apr 20, 2026
6e1705a
Set _tokenizer as trainer attribute (#5489)
albertvillanova Apr 20, 2026
b8d69f7
Align KTO with DPO: Support dict eval_dataset (#5599)
albertvillanova Apr 20, 2026
4ca2e9b
Align KTO with DPO: Align tokenization (#5601)
albertvillanova Apr 20, 2026
d5b534e
Check prefix preservation at the token level (#5559)
qgallouedec Apr 20, 2026
dfe3788
Replace wrong comment about chat template with EOS (#5607)
albertvillanova Apr 20, 2026
14ca4af
Align KTO with DPO: Support IterableDataset (#5600)
albertvillanova Apr 20, 2026
0a54b4d
Drop vLLM 0.11 support (#5549)
qgallouedec Apr 21, 2026
1cc2b98
Align KTO with DPO: Remove maybe_apply_chat_template (#5606)
albertvillanova Apr 21, 2026
ecf9cb3
[TPO] experimental TPO trainer (#5506)
kashif Apr 21, 2026
efa22bc
refactor(distillation): address review feedback on server reverse-KL fix
k1064190 Apr 21, 2026
a08e713
fix: Pass AsyncGRPOTrainer's processing_class to AsyncRolloutWorker (…
xuanduy04 Apr 21, 2026
166d550
docs: update RapidFire AI integration with FSDP and multi-backend tra…
kamran-rapidfireAI Apr 22, 2026
edaf6ec
Fix generate_tiny_models for gpt-oss (#5622)
albertvillanova Apr 22, 2026
6a4a077
Added speculative_config to vllm-serve (#5605)
Ofir408 Apr 22, 2026
9a52d73
feat(glm-4-moe): Add `{% generation %}` markers for training chat tem…
casinca Apr 22, 2026
95e76d5
Fix docstring style in vllm-serve script (#5628)
albertvillanova Apr 22, 2026
3256995
feat: add Gemma/Gemma2 training chat templates with generation marker…
ps-abhi Apr 22, 2026
b3da4eb
Align KTO with DPO: Inline tokenization, new output format, DataColla…
albertvillanova Apr 22, 2026
644d173
feat: add Phi-3 training chat template with generation markers (#5526)
RudrenduPaul Apr 22, 2026
6da8ec5
Remove `forward_masked_logits` (#5626)
qgallouedec Apr 23, 2026
a9cfe47
Use `PreTrainedTokenizerBase` for tokenizer type hints (#5629)
qgallouedec Apr 23, 2026
1996c39
Add doc-builder style check to pre-commit and CI (#5630)
albertvillanova Apr 24, 2026
b43476a
Align and update doc-builder commit hash in CI GitHub Actions (#5631)
albertvillanova Apr 24, 2026
4c8b2e9
Align KTO with DPO: Move completion assembly from _prepare_dataset to…
albertvillanova Apr 24, 2026
208337c
Hotfix CI: Add ruff dependency to doc-builder style check (#5634)
albertvillanova Apr 24, 2026
c693ca1
Fix entropy calculation in SFT (#5620)
qgallouedec Apr 24, 2026
43cbd78
Renaming of internal variables: `async_reward_X` to `async_X` (#5616)
qgallouedec Apr 24, 2026
3aa9519
Align KTO with DPO: Remove BOS/EOS handling (#5635)
albertvillanova Apr 24, 2026
2f10689
Qwen3.6 integration (#5642)
qgallouedec Apr 26, 2026
9679645
Release: v1.3 (#5647)
qgallouedec Apr 26, 2026
4798893
⬆️ Bump dev version (#5648)
qgallouedec Apr 26, 2026
923c318
Align KTO with DPO: Remove model_init parameter (#5659)
albertvillanova Apr 27, 2026
510a6f5
Align KTO with DPO: Remove preprocess_logits_for_metrics parameter (#…
albertvillanova Apr 27, 2026
a7648ba
Add tiny Qwen3-4B-Instruct-2507 (#5586)
qgallouedec Apr 27, 2026
9bcf729
Chunked cross-entropy loss for SFT (up to –50% VRAM) (#5575)
qgallouedec Apr 27, 2026
8d3a3a2
Fix missing PEFT validation when passing peft_config to core trainers…
albertvillanova Apr 28, 2026
4d0fd7d
Fix missing PEFT availability check when passing peft_config to exper…
albertvillanova Apr 28, 2026
9516563
Align KTO with DPO: Align PEFT handling (#5661)
albertvillanova Apr 28, 2026
4455858
Set _tokenizer attribute in experimental trainers (#5566)
albertvillanova Apr 28, 2026
574ebe0
Fix peft_config type hint in experimental trainers (#5666)
albertvillanova Apr 28, 2026
788555a
Add Cohere training chat template (#5627)
dschulmeist Apr 28, 2026
88e0ed4
Simplify peft_config handling in core trainers (#5673)
albertvillanova Apr 29, 2026
fdad6d8
Simplify peft_config handling in experimental trainers (#5674)
albertvillanova Apr 29, 2026
f85334a
Merge branch 'main' into fix/distillation-server-nan-on-variable-comp…
cmpatino Apr 29, 2026
2 changes: 1 addition & 1 deletion .github/workflows/build_documentation.yml
@@ -12,7 +12,7 @@ env:

 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
+    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
     with:
       commit_sha: ${{ github.sha }}
       package: trl
2 changes: 1 addition & 1 deletion .github/workflows/build_pr_documentation.yml
@@ -13,7 +13,7 @@ concurrency:
 jobs:
   build:
     if: github.event.pull_request.draft == false
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
2 changes: 1 addition & 1 deletion .github/workflows/tests_latest.yml
@@ -26,7 +26,7 @@ jobs:
     steps:
       - name: Git checkout
         uses: actions/checkout@v6
-        with: { ref: v1.2-release }
+        with: { ref: v1.3-release }

       - name: Set up Python 3.12
         uses: actions/setup-python@v6
2 changes: 1 addition & 1 deletion .github/workflows/upload_pr_documentation.yml
@@ -8,7 +8,7 @@ on:

 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@9ad2de8582b56c017cb530c1165116d40433f1c6 # main
+    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
     with:
       package_name: trl
     secrets:
16 changes: 9 additions & 7 deletions .pre-commit-config.yaml
@@ -8,10 +8,12 @@ repos:
       - id: ruff-format
         types_or: [ python, pyi ]

-  # - repo: https://github.com/codespell-project/codespell
-  #   rev: v2.1.0
-  #   hooks:
-  #   - id: codespell
-  #     args:
-  #       - --ignore-words-list=nd,reacher,thist,ths,magent,ba
-  #       - --skip=docs/css/termynal.css,docs/js/termynal.js
+  - repo: local
+    hooks:
+      - id: doc-builder-style
+        name: Check style with doc-builder
+        language: python
+        entry: doc-builder style trl tests docs/source --max_len 119
+        additional_dependencies: ["git+https://github.com/huggingface/doc-builder@2430c1ec91d04667414e2fa31ecfc36c153ea391", ruff] # See GH-5633
+        pass_filenames: false
+        types_or: [python, markdown, rst]
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -37,5 +37,5 @@ keywords:
   - language model alignment
   - post-training
 license: Apache-2.0
-version: '1.2'
+version: '1.3'
 date-released: '2020-03-27'
1 change: 0 additions & 1 deletion Makefile
@@ -10,7 +10,6 @@ test:
 precommit:
 	python scripts/add_copyrights.py
 	pre-commit run --all-files
-	doc-builder style trl tests docs/source --max_len 119

 slow_tests:
 	pytest -m "slow" tests/ $(if $(IS_GITHUB_CI),--report-log "slow_tests.log",)
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-1.3.0.dev0
+1.4.0.dev0
4 changes: 4 additions & 0 deletions docs/source/_toctree.yml
@@ -7,6 +7,8 @@
     title: Quickstart
   title: Getting started
 - sections:
+  - local: chat_templates
+    title: Chat Templates
   - local: dataset_formats
     title: Dataset Formats
   - local: paper_index
@@ -133,6 +135,8 @@
     title: SDPO
   - local: ssd_trainer
     title: SSD
+  - local: tpo_trainer
+    title: TPO
   - local: xpo_trainer
     title: XPO
   title: Experimental
2 changes: 2 additions & 0 deletions docs/source/chat_template_utils.md
@@ -1,5 +1,7 @@
 # Chat template utilities

+For an overview of the chat templates bundled with TRL and the rationale behind the training patches, see [Chat Templates](chat_templates).
+
 ## clone_chat_template

 [[autodoc]] clone_chat_template
113 changes: 113 additions & 0 deletions docs/source/chat_templates.md
@@ -0,0 +1,113 @@
# Chat Templates

A [chat template](https://huggingface.co/docs/transformers/en/chat_templating) is a Jinja2 snippet that formats messages into the string a model was trained on. For example:

```python
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
>>> tokenizer.chat_template
"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
>>> tokenizer.apply_chat_template([{"role": "user", "content": "Hi!"}], tokenize=False)
'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHi!<|im_end|>\n'
```

In most cases you don't need to worry about chat templates: models ship their template along with the tokenizer, and TRL applies it for you. The whole thing is transparent. But some TRL recipes rely on features that most shipped templates don't include:

- **SFT with `assistant_only_loss=True`** needs `{% generation %}` / `{% endgeneration %}` markers around assistant output, so the loss mask can target only assistant tokens.
- **GRPO with tool calls** needs the template to be *prefix-preserving*: appending a tool message must not change how earlier messages are rendered.

TRL ships patched templates under [`trl/chat_templates/`](https://github.com/huggingface/trl/tree/main/trl/chat_templates) for common families (Qwen, Llama, DeepSeek-V3, GPT-OSS, ...) and swaps them in automatically for supported models. For any other model, you'll need to patch its template yourself. The rest of this page catalogs what's bundled.
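
To see what the markers do, here is a minimal sketch using an illustrative inline template (not one of the bundled files). With a template that contains `{% generation %}` blocks, `return_assistant_tokens_mask=True` makes the tokenizer return a mask that is 1 exactly for the tokens inside those blocks:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# Illustrative mini-template with generation markers (not one of the bundled templates):
template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}"
    "{% else %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% generation %}{{ message['content'] + '<|im_end|>' }}{% endgeneration %}"
    "{{ '\n' }}"
    "{% endif %}"
    "{% endfor %}"
)

messages = [
    {"role": "user", "content": "Hi!"},
    {"role": "assistant", "content": "Hello!"},
]
out = tokenizer.apply_chat_template(
    messages,
    chat_template=template,
    return_dict=True,
    return_assistant_tokens_mask=True,
)
# 1 for tokens inside the generation block ("Hello!<|im_end|>"), 0 for prompt tokens.
print(out["assistant_masks"])
```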

## Supported model families

TRL stores reference copies of the original templates so it can identify supported models at init and swap in a training template when needed. The following families are recognized: Cohere, DeepSeek-V3, Gemma, GLM-4-MoE, GPT-OSS, Llama 3 / 3.1 / 3.2, Qwen2.5, Qwen3, Qwen3-VL, Qwen3.5, Qwen3.6.

## Training templates

Patched templates that fix training-specific issues, swapped in at init when tools are enabled (GRPO) or when `assistant_only_loss=True` (SFT).

### `cohere_training.jinja`

Patched Cohere template. Diff vs `cohere.jinja`:

Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` so that `return_assistant_tokens_mask=True` produces correct masks for SFT assistant-only loss.

### `deepseekv3_training.jinja`

Patched DeepSeek-V3 template. Diff vs `deepseekv3.jinja`:

- Use `| tojson` on `tool['function']['arguments']` so that `arguments` can be passed as a `dict` (the documented format per [transformers docs](https://huggingface.co/docs/transformers/en/chat_extras#tool-calling-example)). The original template uses raw string concatenation, which crashes on dict inputs.
- Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` markers for SFT assistant-only loss.

### `gemma_training.jinja`

Patched Gemma template (shared by Gemma and Gemma2, which ship identical chat templates). Diff vs `gemma.jinja`:

Split the unified assistant output so that the `<start_of_turn>model\n` header (a prompt cue, not generated by the model) sits outside the generation block, and wrap the assistant content with `{% generation %}` / `{% endgeneration %}` markers for SFT assistant-only loss.

### `glm4moe_training.jinja`

Patched GLM-4-MoE template. Diff vs `glm4moe.jinja`:

Require both `<think>` and `</think>` to be present before parsing, to avoid incorrect splitting when the model generates only one tag:

```diff
- {%- if '</think>' in content %}
+ {%- if '<think>' in content and '</think>' in content %}
```

Wrap assistant message output (including the thinking block and tool calls) with `{% generation %}` / `{% endgeneration %}` markers for SFT assistant-only loss.

### `qwen3_training.jinja`

Patched Qwen3 template. Diff vs `qwen3.jinja`:

Require both `<think>` and `</think>` to be present before parsing, to avoid incorrect splitting when the model generates only one tag:

```diff
- {%- if '</think>' in content %}
+ {%- if '<think>' in content and '</think>' in content %}
```

Always include the thinking block regardless of message position. The original conditionally omits it based on `loop.last`, which changes the assistant rendering when a tool message is appended, breaking prefix-preservation:

```diff
- {%- if loop.index0 > ns.last_query_index %}
- {%- if loop.last or (not loop.last and reasoning_content) %}
- {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
```

Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` so that `return_assistant_tokens_mask=True` produces correct masks for SFT assistant-only loss.

### `gptoss_training.jinja`

Patched GPT-OSS template. Diff vs `gptoss.jinja`:

Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` so that `return_assistant_tokens_mask=True` produces correct masks for SFT assistant-only loss.

### `llama3_training.jinja`

Patched Llama 3 template. Diff vs `llama3.jinja`:

Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` so that `return_assistant_tokens_mask=True` produces correct masks for SFT assistant-only loss.

### `qwen2_5_training.jinja`

Patched Qwen2.5 template. Diff vs `qwen2_5.jinja`:

Wrap assistant message output with `{% generation %}` / `{% endgeneration %}` so that `return_assistant_tokens_mask=True` produces correct masks for SFT assistant-only loss.

### `qwen3_6_training.jinja`

Patched Qwen3.6 template. Diff vs `qwen3_6.jinja`: the same set of changes as `qwen3_training.jinja`. Require both `<think>` and `</think>` to be present before parsing, drop the `loop.index0 > ns.last_query_index` conditional so the thinking block is always emitted (prefix-preservation), and wrap assistant output with `{% generation %}` / `{% endgeneration %}` markers for SFT assistant-only loss.

## Related utilities

See [Chat Template Utilities](chat_template_utils) for the helper functions ([`clone_chat_template`], [`is_chat_template_prefix_preserving`], [`get_training_chat_template`]) that operate on these templates.
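
As a concrete illustration of the prefix-preservation property those helpers check, here is a sketch that tests it at the string level. TRL's actual check (see #5559) compares token sequences, but the idea is the same: rendering an extended conversation must start with the rendering of the original one.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

history = [
    {"role": "user", "content": "What is 2+2?"},
    {"role": "assistant", "content": "Let me check."},
]
# Append a tool message, as the GRPO tool call loop does:
extended = history + [{"role": "tool", "content": "4"}]

rendered = tokenizer.apply_chat_template(history, tokenize=False)
rendered_ext = tokenizer.apply_chat_template(extended, tokenize=False)

# A prefix-preserving template renders the earlier turns identically,
# so this prints True; a template that rewrites earlier assistant turns
# (e.g. by dropping thinking blocks) would fail this check.
print(rendered_ext.startswith(rendered))
```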
4 changes: 4 additions & 0 deletions docs/source/grpo_trainer.md
@@ -632,6 +632,9 @@ trainer = GRPOTrainer(
 Each tool must be a standard Python function with **type-hinted arguments and return types**, along with a **Google-style docstring** describing its purpose, arguments, and return value.
 For more details, see the [Passing tools guide](https://huggingface.co/docs/transformers/en/chat_extras#passing-tools).

+> [!TIP]
+> The GRPO tool call loop requires the chat template to be *prefix-preserving* (appending a tool message must not change how earlier messages are rendered). For known model families (e.g. Qwen3, DeepSeek-V3), TRL automatically swaps in a patched training template when tools are enabled. See [Chat Templates](chat_templates#training-templates) for the full list.
+
 Example:

 ```python
@@ -748,6 +751,7 @@ Tested with:
 - [**Qwen3**](https://huggingface.co/collections/Qwen/qwen3) — e.g., `Qwen/Qwen3-0.6B`
 - [**Qwen3-VL**](https://huggingface.co/collections/Qwen/qwen3-vl) — e.g., `Qwen/Qwen3-VL-2B-Instruct`
 - [**Qwen3.5**](https://huggingface.co/collections/Qwen/qwen35) — e.g., `Qwen/Qwen3.5-2B`
+- [**Qwen3.6**](https://huggingface.co/collections/Qwen/qwen36) — e.g., `Qwen/Qwen3.6-35B-A3B`

 > [!TIP]
 > Compatibility with all LLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.
38 changes: 38 additions & 0 deletions docs/source/paper_index.md
@@ -1403,6 +1403,44 @@ training_args = CPOConfig(
)
```

## Triple Preference Optimization

Papers relating to the [`experimental.tpo.TPOTrainer`]

### Triple Preference Optimization: Achieving Better Alignment using a Single Step Optimization

**📜 Paper**: https://huggingface.co/papers/2405.16681

Introduces Triple Preference Optimization (TPO), a preference learning method that aligns an LLM with three responses per prompt — a gold (`reference`) completion, a preferred (`chosen`) completion and a dispreferred (`rejected`) completion — in a single optimization step. TPO combines a contrastive objective on the (chosen, rejected) pair with a supervised NLL term on the gold response, removing the need for a separate SFT stage and the reference model used in DPO. Used in TRL via [`experimental.tpo.TPOTrainer`]. To reproduce the paper's setting (Llama-3-Base, 5K), use this configuration:

```python
from trl.experimental.tpo import TPOConfig

training_args = TPOConfig(
loss_type="sigmoid", # contrastive loss between chosen and rejected (Section 3 of the paper)
tpo_alpha=1.0, # weight of the NLL term on the gold response (Section 3 of the paper)
beta=0.01, # β temperature (Table 6 of the paper)
learning_rate=5e-7, # Table 6 of the paper
num_train_epochs=1,
max_length=1024,
)
```

To use the TPO-L variant (length-normalized log-probabilities with a target reward margin γ), set `loss_type="tpo-l"` and `tpo_l_gamma`:

```python
from trl.experimental.tpo import TPOConfig

training_args = TPOConfig(
loss_type="tpo-l", # length-normalized variant (Section 3 of the paper)
tpo_alpha=1.0,
beta=0.01,
tpo_l_gamma=0.5, # γ target reward margin (Table 6 of the paper, Llama-3-Base 5K)
learning_rate=5e-7,
num_train_epochs=1,
)
```
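
A minimal end-to-end sketch of the trainer, assuming a DPO-style API and a dataset with `prompt`/`chosen`/`rejected`/`reference` columns — the column names are an assumption here, so check the [`experimental.tpo.TPOTrainer`] docs for the exact expected schema:

```python
from datasets import Dataset
from trl.experimental.tpo import TPOConfig, TPOTrainer

# Toy dataset; the column names are assumed, not verified against the trainer.
train_dataset = Dataset.from_dict(
    {
        "prompt": ["What color is the sky?"],
        "chosen": ["Blue."],
        "rejected": ["Green."],
        "reference": ["The sky is blue."],  # gold completion for the NLL term
    }
)

training_args = TPOConfig(loss_type="sigmoid", tpo_alpha=1.0, beta=0.01)
trainer = TPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",  # any causal LM; string or preloaded model
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()
```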

## Nash Learning from Human Feedback

Papers relating to the [`experimental.nash_md.NashMDTrainer`]