Misc improvements

chiragjn · chiragjn · commit 30c52ec00159 · 2025-06-09T08:59:50.000Z
Enable callbacks injection from plugins

Fix misc issues with axolotl plugins

Fix remote code checking

Enable loss averaging across devices

Add sequence length validation

Enhance sequence length validation

Remove legacy code for patching _get_unpad_data

Add pre-truncation token counting for completion

Fix plugin callbacks duplication

Enable eval on start

Read extra HF training args (`extra_hf_training_args`) from cfg
diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py
@@ -439,7 +439,7 @@ def _configure_gradient_checkpointing(self, training_args_kwargs: dict):
     def _set_base_training_args(
         self, total_num_steps
     ) -> tuple[dict[str, Any], dict[str, Any]]:
-        training_args_kwargs: dict[str, Any] = {}
+        training_args_kwargs: dict[str, Any] = self.cfg.get("extra_hf_training_args") or {}
         trainer_kwargs: dict[str, Any] = {}
 
         self._configure_warmup_and_logging(total_num_steps, training_args_kwargs)
diff --git a/src/axolotl/loaders/tokenizer.py b/src/axolotl/loaders/tokenizer.py
@@ -3,6 +3,7 @@
 import json
 import os
 
+from axolotl.utils.dict import DictDefault
 import transformers
 from transformers import (
     AddedToken,
@@ -185,6 +186,12 @@ def load_tokenizer(cfg):
                 setattr(tokenizer, attr_name, "<|endoftext|>")
 
     additional_special_tokens = None
+
+    if not tokenizer.pad_token:
+        if not cfg.special_tokens:
+            cfg.special_tokens = DictDefault({})
+        cfg.special_tokens.pad_token = tokenizer.eos_token
+
     if cfg.special_tokens:
         special_tokens = cfg.special_tokens.to_dict()
         additional_special_tokens = special_tokens.pop(
diff --git a/src/axolotl/logging_config.py b/src/axolotl/logging_config.py
@@ -96,6 +96,12 @@ def format(self, record):
             "filters": [],
             "stream": sys.stdout,
         },
+        "file": {
+            "class": "logging.FileHandler",
+            "formatter": "simple",
+            "filename": "train.log",
+            "mode": "w",
+        },
     },
     # log level will be superseded by the AxolotlLogger
     "root": {
@@ -104,7 +110,7 @@ def format(self, record):
     },
     "loggers": {
         "axolotl": {
-            "handlers": ["color_console"],
+            "handlers": ["color_console", "file"],
             "level": os.getenv("AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL),
             "propagate": False,
         },
diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -50,6 +50,12 @@ def tokenize_prompt(self, prompt):
         tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
         tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]
 
+        if "num_tokens_pre_truncation" in tokenized_prompt:
+            tokenized_prompt["num_tokens_pre_truncation"] = (
+                tokenized_prompt["num_tokens_pre_truncation"]
+                + tokenized_res_prompt["num_tokens_pre_truncation"]
+            )
+
         return tokenized_prompt
 
 
diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py
@@ -94,20 +94,26 @@ def build_prompt(self, conversation, add_generation_prompt=False, images=None):
                 images=images,
                 return_tensors="pt",
             )
+            # processor output keys, e.g. 'input_ids', 'attention_mask', 'pixel_values'
             # workaround since processor works in batches instead of single examples
             for k, val in batch.items():
                 if k in ["pixel_values"]:
                     batch[k] = val.tolist()
                 else:
                     batch[k] = val.squeeze().tolist()
+            batch["num_tokens_pre_truncation"] = len(batch["input_ids"])
             return batch
 
-        return self.tokenizer.apply_chat_template(
+        input_ids = self.tokenizer.apply_chat_template(
             conversation,
             add_generation_prompt=add_generation_prompt,
             chat_template=self.chat_template,
             **self.chat_template_kwargs,
         )
+        return {
+            "input_ids": input_ids,
+            "num_tokens_pre_truncation": len(input_ids),
+        }
 
     def get_offsets_for_train_detail(
         self, text: str, train_details: List[Dict], mask_untrainable: bool = True
@@ -377,21 +383,25 @@ def _tokenize_single_prompt(self, prompt: dict) -> Dict[str, List[int]]:
         ):
             turns = self.get_conversation_thread(prompt)
             images = self.get_images(prompt)
-            prompt_ids = self.prompter.build_prompt(  # type: ignore
+            # We get back {"input_ids": [...], "num_tokens_pre_truncation": ...}
+            _prompt_ids = self.prompter.build_prompt(
                 turns[:-1],
                 add_generation_prompt=True,
                 images=images,
             )
+            prompt_ids = _prompt_ids["input_ids"]
             tokenized_res = self.prompter.build_prompt(
                 turns, images=images
             )  # type: ignore
             tokenized_prompt = {}
-            if isinstance(tokenized_res, list):
-                input_ids = prompt_ids + tokenized_res[len(prompt_ids) :]
+            if "attention_mask" not in tokenized_res:
+                input_ids = prompt_ids + tokenized_res["input_ids"][len(prompt_ids) :]
                 tokenized_prompt["input_ids"] = input_ids
+                num_tokens_pre_truncation = tokenized_res["num_tokens_pre_truncation"]
                 tokenized_prompt["attention_mask"] = [1] * len(input_ids)
             else:
                 input_ids = tokenized_res["input_ids"]
+                num_tokens_pre_truncation = tokenized_res["num_tokens_pre_truncation"]
                 tokenized_prompt = tokenized_res
 
             if not self.train_on_inputs:
@@ -401,11 +411,14 @@ def _tokenize_single_prompt(self, prompt: dict) -> Dict[str, List[int]]:
                 labels = input_ids
 
             tokenized_prompt["labels"] = labels
+            tokenized_prompt["num_tokens_pre_truncation"] = num_tokens_pre_truncation
 
             return tokenized_prompt
 
         turns = self.get_conversation_thread(prompt)
-        input_ids = self.prompter.build_prompt(turns)  # type: ignore
+        tokenized_res = self.prompter.build_prompt(turns)
+        input_ids = tokenized_res["input_ids"]
+        num_tokens_pre_truncation = tokenized_res["num_tokens_pre_truncation"]
         labels = [IGNORE_TOKEN_ID] * len(input_ids)
 
         last_eos_idx = -1
@@ -518,6 +531,7 @@ def _tokenize_single_prompt(self, prompt: dict) -> Dict[str, List[int]]:
             "input_ids": input_ids,
             "labels": labels,
             "attention_mask": [1] * len(input_ids),
+            "num_tokens_pre_truncation": num_tokens_pre_truncation,
         }
 
     def find_first_eos_token(self, input_ids, start_idx):
@@ -577,10 +591,10 @@ def find_turn(self, turns: list[dict], turn_idx: int):
         turns_with_content = turns[: turn_idx + 1]
 
         # Generate the conversation up to the turn, with final turn replaced with dummy content
-        dummy_ids = self.prompter.build_prompt(turns_with_empty)  # type: ignore
+        dummy_ids = self.prompter.build_prompt(turns_with_empty)["input_ids"]  # type: ignore
 
         # Generate the conversation up to the turn, with final turn included
-        full_ids = self.prompter.build_prompt(turns_with_content)  # type: ignore
+        full_ids = self.prompter.build_prompt(turns_with_content)["input_ids"]  # type: ignore
 
         if not full_ids or not dummy_ids:
             LOG.warning(f"Empty template generated for turn {turn_idx}")
diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py
@@ -1,6 +1,7 @@
 """Module containing PromptTokenizingStrategy and Prompter classes"""
 
 import abc
+import functools
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 from transformers import BatchEncoding, PreTrainedTokenizer
@@ -62,18 +63,23 @@ def supports_batched(self):
     def _tokenize(
         self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
     ) -> BatchEncoding:
-        empty = BatchEncoding(data={"input_ids": [], "attention_mask": []})
+        empty = BatchEncoding(
+            data={"input_ids": [], "attention_mask": [], "num_tokens_pre_truncation": 0}
+        )
         if not prompt:
             LOG.warning("Empty text requested for tokenization.")
             return empty
 
-        result = self.tokenizer(
-            prompt,
-            truncation=True,
+        _tokenize = functools.partial(
+            self.tokenizer,
             max_length=self.max_length,
             padding=False,
             return_tensors=None,
         )
+        result = _tokenize(
+            prompt,
+            truncation=True,
+        )
         if len(result["input_ids"]) == 0:
             LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
             return empty
@@ -91,6 +97,20 @@ def _tokenize(
             result["attention_mask"] = result["attention_mask"][1:]
 
         result["labels"] = result["input_ids"].copy()
+
+        _all_tokens = _tokenize(prompt, truncation=False)
+        num_tokens_pre_truncation = len(_all_tokens["input_ids"])
+        if (
+            _all_tokens["input_ids"][-1] != self.tokenizer.eos_token_id
+            and add_eos_token
+        ):
+            num_tokens_pre_truncation += 1
+        if (
+            _all_tokens["input_ids"][0] == self.tokenizer.bos_token_id
+            and strip_bos_token
+        ):
+            num_tokens_pre_truncation -= 1
+        result["num_tokens_pre_truncation"] = num_tokens_pre_truncation
         return result
 
 
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
@@ -16,7 +16,11 @@
 from datasets import Dataset
 from huggingface_hub.errors import OfflineModeIsEnabled
 from peft import PeftConfig, PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
+from transformers import (
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    ProcessorMixin,
+)
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer
 
@@ -25,7 +29,6 @@
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
     fix_untrained_tokens,
 )
-from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
 from axolotl.integrations.base import PluginManager
 from axolotl.loaders import (
     ModelLoader,
@@ -83,6 +86,9 @@ def setup_model_and_tokenizer(
     if model.generation_config is not None:
         model.generation_config.do_sample = True
 
+    plugin_manager = PluginManager.get_instance()
+    plugin_manager.post_model_load(cfg, model)
+
     # Apply freezing if specified
     if cfg.unfrozen_parameters:
         freeze_layers_except(model, cfg.unfrozen_parameters)
@@ -159,7 +165,11 @@ def setup_signal_handler(
         safe_serialization: Whether to use safe serialization when saving
     """
     # ray workers don't have access to this signal
-    if cfg.local_rank == 0 and not cfg.use_ray:
+    if (
+        cfg.local_rank == 0
+        and not cfg.use_ray
+        and cfg.get("save_model_on_interrupt", True)
+    ):
 
         def terminate_handler(_, __, model_weakref):
             if model_weakref() is not None:
@@ -472,7 +482,7 @@ def handle_untrained_tokens_fix(
 
 
 def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[
-    HFRLTrainerBuilder | HFCausalTrainerBuilder,
+    Trainer,
     PeftModel | PreTrainedModel,
     PreTrainedTokenizer,
     PeftConfig | None,
@@ -573,8 +583,14 @@ def train(
     # Save the trained model and cleanup
     save_trained_model(cfg, trainer, model, safe_serialization)
     create_model_card(cfg, trainer)
-    if not cfg.use_ray:
-        cleanup_distributed()
+
+    if cfg.deepspeed:
+        trainer.deepspeed.destroy()
+    trainer.accelerator.free_memory()
+    trainer.model, trainer.model_wrapped, trainer.optimizer = None, None, None
+
+    # if not cfg.use_ray:
+    #     cleanup_distributed()
 
     plugin_manager.post_train(cfg, model)
 
diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py
@@ -52,7 +52,11 @@
     retry_on_request_exceptions,
 )
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import is_local_main_process, zero_first
+from axolotl.utils.distributed import (
+    compute_and_broadcast,
+    is_local_main_process,
+    zero_first,
+)
 from axolotl.utils.logging import get_logger
 from axolotl.utils.trainer import (
     calculate_total_num_steps,
@@ -174,9 +178,15 @@ def prepare_dataset(cfg, tokenizer, processor=None, preprocess_iterable=None):
     if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
         total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
         if total_eval_steps == 0:
-            raise ValueError(
-                "eval dataset split is too small for sample_packing. You should set `eval_sample_packing: False`. "
+            LOG.warning(
+                "eval dataset split is too small for sample_packing. Setting `eval_sample_packing to False`."
             )
+            if cfg.world_size > 1:
+                _eval_sample_packing = compute_and_broadcast(lambda: 0)
+                if _eval_sample_packing < 1:
+                    cfg.eval_sample_packing = False
+            else:
+                cfg.eval_sample_packing = False
 
     if cfg.max_steps:
         total_num_steps = min(
diff --git a/src/axolotl/utils/data/utils.py b/src/axolotl/utils/data/utils.py
diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py
diff --git a/src/axolotl/utils/samplers/utils.py b/src/axolotl/utils/samplers/utils.py