From 549ec1b6c7a5dadf2bd9efc349886462dd3c4279 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 6 Mar 2024 12:55:42 -0800 Subject: [PATCH 01/33] make some changes for the auto trainer using the DPO trainer --- huggingface_dpo/huggingface_auto_trainer.py | 855 ++++++++++++++++++++ huggingface_dpo/huggingface_dpo.py | 855 ++++++++++++++++++++ 2 files changed, 1710 insertions(+) create mode 100644 huggingface_dpo/huggingface_auto_trainer.py create mode 100644 huggingface_dpo/huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_auto_trainer.py b/huggingface_dpo/huggingface_auto_trainer.py new file mode 100644 index 000000000..d1166318c --- /dev/null +++ b/huggingface_dpo/huggingface_auto_trainer.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: 
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + 
eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..d1166318c --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: 
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + 
eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) From cca1e7ee28c169171b39f50095ed03f43ffd390c Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 6 Mar 2024 12:56:13 -0800 Subject: [PATCH 02/33] adding the dpo from trl --- huggingface_dpo/huggingface_auto_trainer.py | 855 -------------------- 1 file changed, 855 deletions(-) delete mode 100644 huggingface_dpo/huggingface_auto_trainer.py diff --git a/huggingface_dpo/huggingface_auto_trainer.py b/huggingface_dpo/huggingface_auto_trainer.py deleted file mode 100644 index d1166318c..000000000 --- a/huggingface_dpo/huggingface_auto_trainer.py +++ /dev/null @@ -1,855 +0,0 @@ -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - lora = "lora" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - data_collator = "data_collator" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = 
True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. - """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - lora_config: dict, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param lora_config: lora config or None, to load model in appropriate way - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # If lora config was given we want to do lora fine tune, we update model here - if lora_config: - model = peft.get_peft_model(model, lora_config) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: 
- tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - 
eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - lora_config: Union[dict, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. - - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. 
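# A minimal invocation sketch for the parameters documented above, mirroring the
# accompanying test (test_huggingface_dpo.py, added later in this series). The
# "function.yaml" spec, the dataset name and the hyperparameter values are
# illustrative assumptions, not part of this patch.
import mlrun

auto_trainer = mlrun.import_function("function.yaml")

training_run = auto_trainer.run(
    local=True,
    handler="finetune_llm",
    returns=["model"],
    params={
        "model": ("distilgpt2", "transformers.AutoModelForCausalLM"),
        "tokenizer": "distilgpt2",
        "train_dataset": "Abirate/english_quotes",
        "dataset_columns_to_train": "quote",
        "training_config": {"max_steps": 10, "per_device_train_batch_size": 4},
        "use_cuda": False,
    },
)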
- """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.lora: lora_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - lora_config=configs[ConfigKeys.lora], - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = transformers.Trainer( - model=model, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! 
- ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
-            neg_log_likelihood = outputs.loss
-
-        nlls.append(neg_log_likelihood)
-
-        prev_end_loc = end_loc
-        if end_loc == seq_len:
-            break
-
-    ppl = torch.exp(torch.stack(nlls).mean()).item()
-    context.log_result("perplexity", ppl)

From 01c1d08bd35d449db3d56e5fc28632426cd998b8 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 19:05:45 -0700
Subject: [PATCH 03/33] should use dpo_trainer for dpo training

---
 huggingface_dpo/huggingface_dpo.py      |  1 +
 huggingface_dpo/test_huggingface_dpo.py | 42 +++++++++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 huggingface_dpo/test_huggingface_dpo.py

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
index d1166318c..bf2ed3cf0 100644
--- a/huggingface_dpo/huggingface_dpo.py
+++ b/huggingface_dpo/huggingface_dpo.py
@@ -17,6 +17,7 @@
 from mlrun.datastore import is_store_uri
 from mlrun.frameworks._common import CommonTypes, MLRunInterface
 from mlrun.utils import logger
+from trl import DPOTrainer
 from peft import (LoraConfig, PeftModel, get_peft_model,
                   prepare_model_for_kbit_training)
 from plotly import graph_objects as go

diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py
new file mode 100644
index 000000000..53576e4e7
--- /dev/null
+++ b/huggingface_dpo/test_huggingface_dpo.py
@@ -0,0 +1,42 @@
+import tempfile
+
+import mlrun
+
+
+def test_train():
+
+    model_name = "distilgpt2"
+    tokenizer = model_name
+    auto_trainer = mlrun.import_function("function.yaml")
+
+    training_arguments = {
+        "per_device_train_batch_size": 4,
+        "gradient_accumulation_steps": 1,
+        "warmup_steps": 2,
+        "max_steps": 10,
+        "learning_rate": 2e-4,
+        "logging_steps": 1,
+    }
+
+    params = {
+        "model": (model_name, "transformers.AutoModelForCausalLM"),
+        "tokenizer": tokenizer,
+        "train_dataset": "Abirate/english_quotes",
+        "training_config": training_arguments,
+        "dataset_columns_to_train": "quote",
+        "model_pretrained_config": {"use_cache": False},
+        "use_cuda": False,
+    }
+
+    try:
+        with tempfile.TemporaryDirectory() as test_directory:
+            auto_trainer.run(
+                local=True,
+                params=params,
+                handler="finetune_llm",
+                returns=["model"],
+                workdir=test_directory,
+            )
+
+    except Exception as exception:
+        print(f"- The training failed - raised the following error:\n- {exception}")

From c691afcb446c079426051fce3ca7e7d45bd12809 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:20:16 +0000
Subject: [PATCH 04/33] adding the req

---
 huggingface_dpo/requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 huggingface_dpo/requirements.txt

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
new file mode 100644
index 000000000..1376b1d00
--- /dev/null
+++ b/huggingface_dpo/requirements.txt
@@ -0,0 +1,5 @@
+peft
+transformers
+torch
+datasets
+plotly

From c010d6d6bf0bbeaa6ae24ba25b5037e2ec4486c3 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:22:28 +0000
Subject: [PATCH 05/33] using the dpo trainer

---
 huggingface_dpo/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
index 1376b1d00..a86a25fb4 100644
--- a/huggingface_dpo/requirements.txt
+++ b/huggingface_dpo/requirements.txt
@@ -3,3 +3,4 @@ transformers
 torch
 datasets
 plotly
+trl

From 95b5ce53b58fde34e9ed232ab1cb65b3ecc3f58b Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:53:00 +0000
Subject: [PATCH 06/33] adding the mlrun

---
 huggingface_dpo/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
index a86a25fb4..215b90562 100644
--- a/huggingface_dpo/requirements.txt
+++ b/huggingface_dpo/requirements.txt
@@ -4,3 +4,4 @@ torch
 datasets
 plotly
 trl
+mlrun

From 49159194b520be1514caba7e73f824638254888c Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 21:10:07 -0700
Subject: [PATCH 07/33] adding the dpo trainer

---
 huggingface_dpo/huggingface_dpo.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
index bf2ed3cf0..a8c46b768 100644
--- a/huggingface_dpo/huggingface_dpo.py
+++ b/huggingface_dpo/huggingface_dpo.py
@@ -44,11 +44,12 @@ class ConfigKeys:
     deepspeed = "deepspeed"
     quantization = "quantization"
-    lora = "lora"
     training = "training"
     tokenizer_pretrained = "tokenizer_pretrained"
     model_pretrained = "model_pretrained"
+    peft_config = "peft_config"
     data_collator = "data_collator"
+    beta = "beta"
@@ -70,7 +71,7 @@ class HFTrainerMLRunInterface(MLRunInterface, ABC):
     @classmethod
     def add_interface(
         cls,
-        obj: Trainer,
+        obj: DPOTrainer,
         restoration: CommonTypes.MLRunInterfaceRestorationType = None,
     ):
         super(HFTrainerMLRunInterface, cls).add_interface(
@@ -79,7 +80,7 @@ def add_interface(
 
     @classmethod
     def mlrun_train(cls):
-        def wrapper(self: Trainer, *args, **kwargs):
+        def wrapper(self: DPOTrainer, *args, **kwargs):
             # Restore the evaluation method as `train` will use it:
             # cls._restore_attribute(obj=self, attribute_name="evaluate")
 
@@ -386,7 +387,6 @@ def _set_model_and_tokenizer(
     tokenizer: Union[str, List[str]],
     task: str,
     framework: str,
-    lora_config: dict,
     quantization_config: dict,
     use_cuda: bool,
     tokenizer_pretrained_config,
@@ -400,7 +400,6 @@ def _set_model_and_tokenizer(
     :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
     :param task: a supported nlp task, used to choose model if not provided
     :param framework: pt or tf
-    :param lora_config: lora config or None, to load model in appropriate way
     :param quantization_config: quantization config or None, to load model in appropriate way
     :param use_cuda: use gpu or not
     :param tokenizer_pretrained_config: config to load the pretrained tokenizer
@@ -470,10 +469,6 @@ def _set_model_and_tokenizer(
         model.gradient_checkpointing_enable()
         model = peft.prepare_model_for_kbit_training(model)
 
-    # If lora config was given we want to do lora fine tune, we update model here
-    if lora_config:
-        model = peft.get_peft_model(model, lora_config)
-
     # if not specified we choose the default tokenizer that corresponding to the model
     if tokenizer is None:
         tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
@@ -639,7 +634,8 @@ def finetune_llm(
     tokenizer: Union[str, List[str]] = None,
     deepspeed_config: Union[dict, bool] = False,
     quantization_config: Union[dict, bool] = False,
-    lora_config: Union[dict, bool] = False,
+    peft_config: Union[dict, bool] = False,
+    beta: Union[float, bool] = False,
     training_config: dict = {},
     model_pretrained_config: dict = {},
     tokenizer_pretrained_config: dict = {},
@@ -683,11 +679,12 @@ def finetune_llm(
     configs = {
         ConfigKeys.deepspeed: deepspeed_config,
         ConfigKeys.quantization: quantization_config,
-        ConfigKeys.lora: lora_config,
         ConfigKeys.training: training_config,
         ConfigKeys.model_pretrained: model_pretrained_config,
         ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
         ConfigKeys.data_collator: data_collator_config,
+        ConfigKeys.peft_config: peft_config,
+        ConfigKeys.beta: beta,
     }
     _update_config(dst=configs, src=kwargs)
@@ -705,7 +702,6 @@ def finetune_llm(
         tokenizer=tokenizer,
         task=task,
         framework=framework,
-        lora_config=configs[ConfigKeys.lora],
         quantization_config=configs[ConfigKeys.quantization],
         use_cuda=use_cuda,
         tokenizer_pretrained_config=tokenizer_pretrained_config,
@@ -744,10 +740,13 @@ def finetune_llm(
         **train_kwargs,
     )
 
-    trainer = transformers.Trainer(
+    trainer = DPOTrainer(
         model=model,
+        ref_model=None,
         train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
+        peft_config=configs[ConfigKeys.peft_config],
+        beta=configs[ConfigKeys.beta],
         tokenizer=tokenizer,
         data_collator=data_collator,
         args=training_args,

From 96c08f4d5fbf44f65e0efc1bd3ac2d6d73253ae9 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 21:13:01 -0700
Subject: [PATCH 08/33] add dpo trainer

---
 huggingface_dpo/huggingface_dpo.py | 855 -----------------------------
 1 file changed, 855 deletions(-)
 delete mode 100644 huggingface_dpo/huggingface_dpo.py

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
deleted file mode 100644
index a8c46b768..000000000
--- a/huggingface_dpo/huggingface_dpo.py
+++ /dev/null
@@ -1,855 +0,0 @@
-import importlib
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Dict, List, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import peft
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from mlrun.artifacts.manager import Artifact, PlotlyArtifact
-from mlrun.datastore import is_store_uri
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import logger
-from trl import DPOTrainer
-from peft import (LoraConfig, PeftModel, get_peft_model,
-                  prepare_model_for_kbit_training)
-from plotly import graph_objects as go
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
-                          TrainerCallback, TrainerControl, TrainerState,
-                          TrainingArguments)
-
-supported_tasks = [
-    "question-answering",
-    "summarization",
-    "table-question-answering",
-    "text2text-generation",
-    "text-classification",
-    "sentiment-analysis",
-    "text-generation",
-    "token-classification",
-    "translation",
-    "translation_xx_to_yy",
-]
-
-
-class ConfigKeys:
-    deepspeed = "deepspeed"
-    quantization = "quantization"
-    training = "training"
-    tokenizer_pretrained = "tokenizer_pretrained"
-    model_pretrained = "model_pretrained"
-    peft_config = "peft_config"
-    data_collator = "data_collator"
-    beta = "beta"
-
-
-# ----------------------from MLRUN--------------------------------
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
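# Worth illustrating next to the DPOTrainer switch above: TRL's DPO trainer consumes
# preference pairs rather than a single tokenized text column. A minimal sketch of
# that dataset shape (column names follow the trl convention; the rows are invented):
from datasets import Dataset

preference_pairs = Dataset.from_dict(
    {
        "prompt": ["Explain what MLRun does."],
        "chosen": ["MLRun orchestrates and tracks machine learning pipelines."],
        "rejected": ["No idea."],
    }
)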
- _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: DPOTrainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: DPOTrainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - 
self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
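# A small illustration of the prefix routing described above, assuming the
# _update_config helper defined in this file; the key names and values are made up.
configs = {"training": {"max_steps": 10}, "deepspeed": False}
overrides = {"training_learning_rate": 2e-4, "training_warmup_steps": 2}

_update_config(src=overrides, dst=configs)
assert configs["training"] == {"max_steps": 10, "learning_rate": 2e-4, "warmup_steps": 2}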
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - 
logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - 
train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - peft_config: Union[dict, bool] = False, - beta: Union[float, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. 
- - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - ConfigKeys.peft_config: peft_config, - ConfigKeys.beta: beta, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the 
trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = trl.DPOTrainer( - model=model, - ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
- neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) From 1cb999eb3c4b06f7a39614384ae3a0368f5a6420 Mon Sep 17 00:00:00 2001 From: peng wei Date: Sun, 17 Mar 2024 21:13:37 -0700 Subject: [PATCH 09/33] added dpo trainer --- huggingface_dpo/huggingface_dpo.py | 855 +++++++++++++++++++++++++++++ 1 file changed, 855 insertions(+) create mode 100644 huggingface_dpo/huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..a8c46b768 --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from trl import DPOTrainer +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + peft_config = "peft_config" + data_collator = "data_collator" + beta = "beta" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: DPOTrainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: DPOTrainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
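# A minimal wiring sketch for this callback via apply_mlrun, assuming a trainer
# instance (e.g. the DPOTrainer built in finetune_llm) already exists; the model
# name is illustrative.
import mlrun

ctx = mlrun.get_or_create_ctx("mlrun-huggingface")
apply_mlrun(trainer, model_name="distilgpt2", context=ctx)
trainer.train()  # metric values reported at each log step are written to the MLRun context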
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then 
it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + 
*[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + peft_config: Union[dict, bool] = False, + beta: Union[float, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + ConfigKeys.peft_config: peft_config, + ConfigKeys.beta: beta, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = trl.DPOTrainer( + model=model, + ref_model = None, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + peft_config=configs[ConfigKeys.peft_config], + beta = configs[ConfigKeys.beta], + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) From 7e6af5fc35c1da8ae466a8c45d7d8cc84762edeb Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 18 Mar 2024 23:05:49 +0000 Subject: [PATCH 10/33] continue the coding --- huggingface_dpo/huggingface_dpo.py | 9 +++++---- huggingface_dpo/test_huggingface_dpo.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py index a8c46b768..31e418f30 100644 --- a/huggingface_dpo/huggingface_dpo.py +++ b/huggingface_dpo/huggingface_dpo.py @@ -244,7 +244,7 @@ def log_metric_plot(self, name: str, scores: List[float]): def apply_mlrun( - trainer: transformers.Trainer, + trainer: trl.DPOTrainer, model_name: str = None, tag: str = "", context: mlrun.MLClientCtx = None, @@ -302,10 +302,11 @@ def _print_trainable_parameters(model): bnb_4bit_compute_dtype=torch.bfloat16, ) -LORA_CONFIG = peft.LoraConfig( +PEFT_CONFIG = peft.LoraConfig( r=8, - lora_alpha=32, - target_modules=["query_key_value"], + lora_alpha=16, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index 53576e4e7..691605c83 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -5,7 +5,7 @@ def test_train(): - model_name = "distilgpt2" + model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name auto_trainer = mlrun.import_function("function.yaml") From d4e0940dbd7b1aa45b66995980ef4542004cd94f Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 16:37:55 +0000 Subject: [PATCH 11/33] should be the same as trainer --- huggingface_dpo/huggingface_dpo.py | 4 ++-- huggingface_dpo/item.yaml | 23 +++++++++++++++++++++++ huggingface_dpo/test_huggingface_dpo.py | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 huggingface_dpo/item.yaml diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py index 31e418f30..5f2a680d0 100644 --- a/huggingface_dpo/huggingface_dpo.py +++ b/huggingface_dpo/huggingface_dpo.py @@ -347,7 +347,7 @@ def _update_config(src: dict, dst: dict): config = QUANTIZATION_CONFIG if config is True and config_name == "lora": - config = LORA_CONFIG + config = PEFT_CONFIG if config is True and config_name == "deepspeed": config = DEEPSPEED_CONFIG @@ -624,7 +624,7 @@ def _prepare_dataset( return tokenized_train_dataset, tokenized_eval_dataset -def finetune_llm( +def dpo_train( context: mlrun.MLClientCtx, train_dataset: Union[str, mlrun.datastore.DataItem], eval_dataset: str = None, diff --git a/huggingface_dpo/item.yaml b/huggingface_dpo/item.yaml new file mode 100644 index 000000000..4f6cc1c1c --- /dev/null +++ b/huggingface_dpo/item.yaml @@ -0,0 +1,23 @@ + +apiVersion: v1 +categories: [] # List of category names +description: '' # Short description +doc: '' # Path to README.md if exists +example: '' # Path to examole notebook +generationDate: 2024-03-19 16:26:27.342027 +icon: '' # Path to icon file +labels: {} # Key values label pairs +maintainers: [] # List of maintainers +mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema +name: '' # Function name +platformVersion: '' # 
Function’s Iguazio version requirement, should follow python’s versioning schema +spec: + filename: '' # Implementation file + handler: '' # Handler function name + image: '' # Base image name + kind: '' # Function kind + requirements: [] # List of Pythonic library requirements + customFields: {} # Custom spec fields + env: [] # Spec environment params +url: '' +version: 0.0.1 # Function version, should follow standard semantic versioning schema diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index 691605c83..adf70b494 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -33,7 +33,7 @@ def test_train(): auto_trainer.run( local=True, params=params, - handler="finetune_llm", + handler="dpo_train", returns=["model"], workdir=test_directory, ) From 1c26ef12b940c71a634bc9a75fa1ef0dccc38ef4 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 16:41:35 +0000 Subject: [PATCH 12/33] try generate the function.yaml --- huggingface_dpo/test_huggingface_dpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index adf70b494..7899debba 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -3,7 +3,7 @@ import mlrun -def test_train(): +def test_dpo_train(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name From 93beb7bcb7bbb5c0bea3291ad96bbb92f742a927 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:04:21 +0000 Subject: [PATCH 13/33] adding the dop_trainer --- .../{huggingface_dpo.py => huggingface_dpo_trainer.py} | 0 .../{test_huggingface_dpo.py => test_huggingface_dpo_trainer.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename huggingface_dpo/{huggingface_dpo.py => huggingface_dpo_trainer.py} (100%) rename huggingface_dpo/{test_huggingface_dpo.py => test_huggingface_dpo_trainer.py} (100%) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo_trainer.py similarity index 100% rename from huggingface_dpo/huggingface_dpo.py rename to huggingface_dpo/huggingface_dpo_trainer.py diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo_trainer.py similarity index 100% rename from huggingface_dpo/test_huggingface_dpo.py rename to huggingface_dpo/test_huggingface_dpo_trainer.py From a3c78626af0afe37469abb975ece3bc3c8da3de7 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:05:09 +0000 Subject: [PATCH 14/33] update item --- huggingface_dpo/item.yaml | 40 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/huggingface_dpo/item.yaml b/huggingface_dpo/item.yaml index 4f6cc1c1c..3eff1eede 100644 --- a/huggingface_dpo/item.yaml +++ b/huggingface_dpo/item.yaml @@ -1,23 +1,23 @@ - apiVersion: v1 -categories: [] # List of category names -description: '' # Short description -doc: '' # Path to README.md if exists -example: '' # Path to examole notebook -generationDate: 2024-03-19 16:26:27.342027 -icon: '' # Path to icon file -labels: {} # Key values label pairs -maintainers: [] # List of maintainers -mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema -name: '' # Function name -platformVersion: '' # Function’s Iguazio version requirement, should follow python’s versioning schema +categories: +- machine-learning +- model-training +description: doing the 
alignment with dpo trainer +doc: '' +example: huggingface_dpo_trainer.ipynb +generationDate: 2024-03-19:09-25 +hidden: false +icon: '' +labels: + author: pgw +maintainers: [] +marketplaceType: '' +name: huggingface-dpo-trainer spec: - filename: '' # Implementation file - handler: '' # Handler function name - image: '' # Base image name - kind: '' # Function kind - requirements: [] # List of Pythonic library requirements - customFields: {} # Custom spec fields - env: [] # Spec environment params + filename: huggingface_dpo_trainer.py + handler: dpo_train + image: mlrun/mlrun + kind: job + requirements: [] url: '' -version: 0.0.1 # Function version, should follow standard semantic versioning schema +version: 1.0.0 From e44d87007a404d2f40f6a45be66d0b1977ec2887 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:07:16 +0000 Subject: [PATCH 15/33] add function yaml file --- huggingface_dpo/function.yaml | 374 ++++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 huggingface_dpo/function.yaml diff --git a/huggingface_dpo/function.yaml b/huggingface_dpo/function.yaml new file mode 100644 index 000000000..d0baab33a --- /dev/null +++ b/huggingface_dpo/function.yaml @@ -0,0 +1,374 @@ +kind: job +metadata: + name: huggingface-dpo-trainer + tag: '' + hash: 3db0dab27e7aaa2f91a96c2545060cc7e1a15676 + project: '' + labels: + author: pgw + categories: + - machine-learning + - model-training +spec: + command: '' + args: [] + image: mlrun/mlrun + build: + functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
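    # These keys name the override-able config dicts and double as kwarg prefixes consumed by _update_config.&#xD;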
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft_config"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face / TRL trainer API.&#xD;
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,&#xD;
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
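    # With LoRA / k-bit training only the adapter parameters require gradients, so the printed trainable share is typically small.&#xD;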
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# will be used if user provides "True" with config name as input
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
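# NOTE: the default LoRA target modules above match the projection layer names used by&#xD;
# Llama/Mistral-style architectures (e.g. the Mistral-7B model used in the tests);&#xD;
# other model families may require a different target_modules list.&#xD;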

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.&#xD;
&#xD;
    Goes over all configs and their corresponding prefixes, collects all the keys from the given dict that start&#xD;
     with the prefix, and adds them to the appropriate config.&#xD;

    :param src: dict of all candidate values to update dict.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If given True we use default dict
        # Can also be False or a config dict given by the user, so we check specifically for True&#xD;
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "peft_config":&#xD;
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map to use when training across multiple GPUs&#xD;

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")&#xD;

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the model name and the model class&#xD;
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified we choose the default tokenizer corresponding to the model&#xD;
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a train dataset we load the "train" split&#xD;
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them&#xD;
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and&#xD;
    returns them ready to use in training

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given, we must check whether it contains both datasets or only the train set&#xD;
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Language Model (LLM) with Direct Preference Optimization (DPO) using the provided dataset.&#xD;
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf&#xD;
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used for evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).&#xD;
    :param beta: The DPO beta parameter, controlling how strongly the policy is kept close to the reference model (optional).&#xD;
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )
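    # NOTE: trl's DPOTrainer is typically fed preference data with "prompt", "chosen" and&#xD;
    # "rejected" columns, so the loaded dataset should follow that format.&#xD;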

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )
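    # mlm=False configures the collator for causal (next-token) language modeling batches.&#xD;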

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

    trainer = DPOTrainer(&#xD;
        model=model,
        ref_model=None,&#xD;
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],&#xD;
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    temp_directory = tempfile.mkdtemp()&#xD;
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 + commands: [] + code_origin: '' + origin_filename: '' + requirements: [] + entry_points: + add_interface: + name: add_interface + doc: '' + parameters: + - name: cls + default: '' + - name: obj + type: DPOTrainer + default: '' + - name: restoration + type: MLRunInterfaceRestorationType + default: null + outputs: + - default: '' + lineno: 72 + mlrun_train: + name: mlrun_train + doc: '' + parameters: + - name: cls + default: '' + outputs: + - default: '' + lineno: 82 + wrapper: + name: wrapper + doc: '' + parameters: + - name: self + type: DPOTrainer + default: '' + outputs: + - default: '' + lineno: 83 + on_epoch_begin: + name: on_epoch_begin + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 131 + on_epoch_end: + name: on_epoch_end + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 142 + on_log: + name: on_log + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + - name: logs + type: Dict[str, float] + default: null + outputs: + - default: '' + lineno: 153 + on_train_begin: + name: on_train_begin + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 179 + on_train_end: + name: on_train_end + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + - name: model + type: PreTrainedModel + default: null + - name: tokenizer + type: PreTrainedTokenizer + default: null + outputs: + - default: '' + lineno: 190 + on_evaluate: + name: on_evaluate + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 203 + log_metrics: + name: log_metrics + doc: '' + parameters: + - name: self + default: '' + outputs: + - default: '' + lineno: 217 + log_metric_plot: + name: log_metric_plot + doc: '' + parameters: + - name: self + default: '' + - name: name + type: str + default: '' + - name: scores + type: List[float] + default: '' + outputs: + - default: '' + lineno: 224 + apply_mlrun: + name: apply_mlrun + doc: This is temporary and will be built in mlrun 1.5.0 + parameters: + - name: trainer + type: DPOTrainer + default: '' + - name: model_name + type: str + default: null + - name: tag + type: str + default: '' + - name: context + type: MLClientCtx + default: null + - name: auto_log + type: bool + default: true + - name: labels + type: Dict[str, str] + default: null + - name: extra_data + type: dict + default: null + outputs: + - default: '' + lineno: 246 + dpo_train: + name: dpo_train + doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ + \ dataset.\n The function takes various configuration 
parameters to customize\ + \ the training process\n and adapt the model to specific tasks using a provided\ + \ dataset." + parameters: + - name: context + type: MLClientCtx + doc: mlrun context in order to log trained model + default: '' + - name: train_dataset + type: Union[str, mlrun.datastore.DataItem] + doc: The train dataset used for fine-tuning the language model. + default: '' + - name: eval_dataset + type: str + doc: The eval dataset used for evaluate the language model during training. + default: null + - name: train_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: eval_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: dataset_columns_to_train + type: Union[str, list] + doc: which columns to pass to the model as inputs + default: text + - name: model + type: Union[str, List[str]] + doc: a tuple containing model name and class, or str with model name or path + default: huggingface-model + - name: tokenizer + type: Union[str, List[str]] + doc: a tuple containing tokenizer name and class, or str with tokenizer name + or path + default: null + - name: deepspeed_config + type: Union[dict, bool] + doc: Configuration options for DeepSpeed (optional). + default: false + - name: quantization_config + type: Union[dict, bool] + doc: Configuration options for model quantization (optional). + default: false + - name: peft_config + type: Union[dict, bool] + default: false + - name: beta + type: Union[float, bool] + default: false + - name: training_config + type: dict + doc: Configuration options specific to the fine-tuning training process (optional). + default: {} + - name: model_pretrained_config + type: dict + doc: config to load the pretrained model + default: {} + - name: tokenizer_pretrained_config + type: dict + doc: config to load the pretrained tokenizer + default: {} + - name: data_collator_config + type: dict + doc: Configuration options for data collation during training (optional). + default: {} + - name: task + type: str + doc: A description of the specific task the model is being fine-tuned for. 
+ default: text-generation + - name: use_cuda + type: bool + doc: use gpu or not + default: true + - name: framework + type: str + doc: pt ot tf + default: pt + - name: device_map + type: str + default: auto + outputs: + - default: '' + lineno: 627 + evaluate: + name: evaluate + doc: 'Evaluating the model using perplexity, for more information visit: + + https://huggingface.co/docs/transformers/perplexity' + parameters: + - name: context + doc: mlrun context + default: '' + - name: model_path + doc: path to the model directory + default: '' + - name: data + type: DataFrame + doc: the data to evaluate the model + default: '' + - name: model_name + type: str + doc: name of base model + default: null + - name: tokenizer_name + type: str + doc: name of base tokenizer + default: null + outputs: + - default: '' + lineno: 785 + description: doing the alignment with dpo trainer + default_handler: dpo_train + disable_auto_mount: false + clone_target_dir: '' + env: [] + resources: + requests: + memory: 1Mi + cpu: 25m + limits: + memory: 20Gi + cpu: '2' + priority_class_name: igz-workload-medium + preemption_mode: prevent + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: app.iguazio.com/lifecycle + operator: NotIn + values: + - preemptible + - key: eks.amazonaws.com/capacityType + operator: NotIn + values: + - SPOT + - key: node-lifecycle + operator: NotIn + values: + - spot + tolerations: null + security_context: {} +verbose: false From b343632065b93709652d39314b93e6b2b59f94a5 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 19:20:18 +0000 Subject: [PATCH 16/33] rename the trainer --- huggingface_dpo/huggingface_dpo.py | 870 ++++++++++++++++++ huggingface_dpo/huggingface_dpo_trainer.py | 3 +- huggingface_dpo/test_huggingface_dpo.py | 56 ++ .../test_huggingface_dpo_trainer.py | 6 +- 4 files changed, 932 insertions(+), 3 deletions(-) create mode 100644 huggingface_dpo/huggingface_dpo.py create mode 100644 huggingface_dpo/test_huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..8dcf63b29 --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,870 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
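+# Note (illustrative sketch only, not part of this function's API): `trl.DPOTrainer`
+# fits the model on preference pairs rather than on a single tokenized text column.
+# Each training record is typically expected to carry "prompt", "chosen" and
+# "rejected" fields, e.g.:
+#
+#     {
+#         "prompt": "Summarize DPO in one sentence.",
+#         "chosen": "DPO aligns a model directly on preference pairs, without a separate reward model.",
+#         "rejected": "DPO is a tokenizer setting.",
+#     }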
+
+import importlib
+import os
+import shutil
+import tempfile
+import zipfile
+from abc import ABC
+from typing import Dict, List, Tuple, Union
+
+import mlrun
+import numpy as np
+import pandas as pd
+import peft
+import torch
+import transformers
+import trl
+from datasets import Dataset, load_dataset
+from mlrun.artifacts.manager import Artifact, PlotlyArtifact
+from mlrun.datastore import is_store_uri
+from mlrun.frameworks._common import CommonTypes, MLRunInterface
+from mlrun.utils import logger
+from trl import DPOTrainer
+from peft import (LoraConfig, PeftModel, get_peft_model,
+                  prepare_model_for_kbit_training)
+from plotly import graph_objects as go
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
+                          PreTrainedModel, PreTrainedTokenizer, Trainer,
+                          TrainerCallback, TrainerControl, TrainerState,
+                          TrainingArguments)
+
+supported_tasks = [
+    "question-answering",
+    "summarization",
+    "table-question-answering",
+    "text2text-generation",
+    "text-classification",
+    "sentiment-analysis",
+    "text-generation",
+    "token-classification",
+    "translation",
+    "translation_xx_to_yy",
+]
+
+
+class ConfigKeys:
+    deepspeed = "deepspeed"
+    quantization = "quantization"
+    training = "training"
+    tokenizer_pretrained = "tokenizer_pretrained"
+    model_pretrained = "model_pretrained"
+    peft_config = "peft_config"
+    data_collator = "data_collator"
+    beta = "beta"
+
+
+# ----------------------from MLRUN--------------------------------
+class HFTrainerMLRunInterface(MLRunInterface, ABC):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Interface for adding MLRun features to the `trl.DPOTrainer` API.
+    """
+
+    # MLRun's context default name:
+    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
+
+    # Attributes to replace so the MLRun interface will be fully enabled.
+    _REPLACED_METHODS = [
+        "train",
+        # "evaluate"
+    ]
+
+    @classmethod
+    def add_interface(
+        cls,
+        obj: DPOTrainer,
+        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
+    ):
+        super(HFTrainerMLRunInterface, cls).add_interface(
+            obj=obj, restoration=restoration
+        )
+
+    @classmethod
+    def mlrun_train(cls):
+        def wrapper(self: DPOTrainer, *args, **kwargs):
+            # Restore the evaluation method as `train` will use it:
+            # cls._restore_attribute(obj=self, attribute_name="evaluate")
+
+            # Call the original train method:
+            result = self.original_train(*args, **kwargs)
+
+            # Replace the evaluation method again:
+            # cls._replace_function(obj=self, function_name="evaluate")
+
+            return result
+
+        return wrapper
+
+
+class MLRunCallback(TrainerCallback):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Callback for collecting logs during training / evaluation of the `Trainer` API.
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: trl.DPOTrainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, 
+ labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +PEFT_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=16, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = PEFT_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then 
it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + 
+                            *[examples[col] for col in dataset_columns_to_train],
+                            truncation=True,
+                            padding=True,
+                        ),
+                        batched=True,
+                    ),
+                    None,
+                )
+        else:
+            logger.error("train dataset is mandatory")
+            raise KeyError("no train dataset found in given dataset")
+
+    # Tokenize the data so the model can understand it
+    tokenized_train_dataset = train_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    tokenized_eval_dataset = eval_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    return tokenized_train_dataset, tokenized_eval_dataset
+
+
+def dpo_train(
+    context: mlrun.MLClientCtx,
+    train_dataset: Union[str, mlrun.datastore.DataItem],
+    eval_dataset: str = None,
+    train_load_dataset_kwargs: dict = {},
+    eval_load_dataset_kwargs: dict = {},
+    dataset_columns_to_train: Union[str, list] = "text",
+    model: Union[str, List[str]] = "huggingface-model",
+    tokenizer: Union[str, List[str]] = None,
+    deepspeed_config: Union[dict, bool] = False,
+    quantization_config: Union[dict, bool] = False,
+    peft_config: Union[dict, bool] = False,
+    beta: Union[float, bool] = False,
+    training_config: dict = {},
+    model_pretrained_config: dict = {},
+    tokenizer_pretrained_config: dict = {},
+    data_collator_config: dict = {},
+    task: str = "text-generation",
+    use_cuda: bool = True,
+    framework: str = "pt",
+    device_map: str = "auto",
+    **kwargs,
+):
+    """
+    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
+    The function takes various configuration parameters to customize the training process
+    and adapt the model to specific tasks using a provided dataset.
+
+    :param context: mlrun context in order to log the trained model
+    :param dataset_columns_to_train: which columns to pass to the model as inputs
+    :param eval_load_dataset_kwargs: kwargs for dataset loading
+    :param train_load_dataset_kwargs: kwargs for dataset loading
+    :param framework: pt or tf
+    :param use_cuda: use gpu or not
+    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
+    :param model_pretrained_config: config to load the pretrained model
+    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
+    :param model: a tuple containing model name and class, or str with model name or path
+    :param train_dataset: The train dataset used for fine-tuning the language model.
+    :param eval_dataset: The eval dataset used to evaluate the language model during training.
+    :param deepspeed_config: Configuration options for DeepSpeed (optional).
+    :param quantization_config: Configuration options for model quantization (optional).
+    :param peft_config: Configuration options for PEFT / LoRA adapters (optional).
+    :param training_config: Configuration options specific to the fine-tuning training process (optional).
+    :param data_collator_config: Configuration options for data collation during training (optional).
+    :param task: A description of the specific task the model is being fine-tuned for.
+    :param kwargs: Additional keyword arguments.
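+    :param beta: The beta factor for the DPO loss as used by `trl.DPOTrainer`; higher values
+                 keep the trained policy closer to the reference model (optional).
+    :param device_map: A device map for model loading when training with multiple GPUs (optional).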
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + ConfigKeys.peft_config: peft_config, + ConfigKeys.beta: beta, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = trl.DPOTrainer( + model=model, + ref_model = None, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + peft_config=configs[ConfigKeys.peft_config], + beta = configs[ConfigKeys.beta], + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
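+        # Perplexity is recovered below as the exponential of the mean negative
+        # log-likelihood over all evaluated windows: ppl = exp(mean(nll_1..nll_k)).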
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 5f2a680d0..0eb076dde 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -675,7 +675,8 @@ def dpo_train( # TODO: match forward.keyword to dataset.keyword - check if relevant in new design # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - + import pdb + pdb.set_trace() # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py new file mode 100644 index 000000000..b310aaf37 --- /dev/null +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -0,0 +1,56 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile + +import mlrun + + +def test_dpo_train(): + + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + auto_trainer = mlrun.import_function("function.yaml") + + training_arguments = { + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 1, + "warmup_steps": 2, + "max_steps": 10, + "learning_rate": 2e-4, + "logging_steps": 1, + } + + params = { + "model": (model_name, "transformers.AutoModelForCausalLM"), + "tokenizer": tokenizer, + "train_dataset": "HuggingFaceH4/orca_dpo_pairs", + "training_config": training_arguments, + "dataset_columns_to_train": "quote", + "model_pretrained_config": {"use_cache": False}, + "use_cuda": False, + } + + try: + with tempfile.TemporaryDirectory() as test_directory: + auto_trainer.run( + local=True, + params=params, + handler="dpo_train", + returns=["model"], + workdir=test_directory, + ) + + except Exception as exception: + print(f"- The training failed - raised the following error:\n- {exception}") diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 7899debba..d2cfaaf02 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -7,7 +7,7 @@ def test_dpo_train(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") + dop_trainer = mlrun.import_function("function.yaml") training_arguments = { "per_device_train_batch_size": 4, @@ -20,17 +20,19 @@ def test_dpo_train(): params = { "model": (model_name, "transformers.AutoModelForCausalLM"), + "ref_model": None, "tokenizer": tokenizer, "train_dataset": "Abirate/english_quotes", "training_config": training_arguments, "dataset_columns_to_train": "quote", "model_pretrained_config": {"use_cache": False}, + "use_cuda": False, } try: with tempfile.TemporaryDirectory() as 
test_directory: - auto_trainer.run( + dpo_trainer.run( local=True, params=params, handler="dpo_train", From 6b28938560829ca49cb11f8570550e26333a50c7 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 19:21:00 +0000 Subject: [PATCH 17/33] get rid of the older version --- huggingface_dpo/huggingface_dpo.py | 870 ------------------------ huggingface_dpo/test_huggingface_dpo.py | 56 -- 2 files changed, 926 deletions(-) delete mode 100644 huggingface_dpo/huggingface_dpo.py delete mode 100644 huggingface_dpo/test_huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py deleted file mode 100644 index 8dcf63b29..000000000 --- a/huggingface_dpo/huggingface_dpo.py +++ /dev/null @@ -1,870 +0,0 @@ -# Copyright 2024 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from trl import DPOTrainer -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - peft_config = "peft_config" - data_collator = "data_collator" - beta = "beta" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. 
- _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: DPOTrainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: DPOTrainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - 
self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: trl.DPOTrainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -PEFT_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=16, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = PEFT_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - 
logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - 
train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def dpo_train( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - peft_config: Union[dict, bool] = False, - beta: Union[float, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. 
- - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - ConfigKeys.peft_config: peft_config, - ConfigKeys.beta: beta, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the 
trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = trl.DPOTrainer( - model=model, - ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
- neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py deleted file mode 100644 index b310aaf37..000000000 --- a/huggingface_dpo/test_huggingface_dpo.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2024 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile - -import mlrun - - -def test_dpo_train(): - - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "tokenizer": tokenizer, - "train_dataset": "HuggingFaceH4/orca_dpo_pairs", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - auto_trainer.run( - local=True, - params=params, - handler="dpo_train", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") From 5239e5554a5553047c2f3f3b0fe67df5b75fcb7a Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 01:40:22 +0000 Subject: [PATCH 18/33] can trigger the run. seems don't need override the dataloader --- huggingface_dpo/huggingface_dpo_trainer.py | 53 ++++++++++++------- huggingface_dpo/requirements.txt | 1 + .../test_huggingface_dpo_trainer.py | 52 ++++++++++++++++++ 3 files changed, 87 insertions(+), 19 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 0eb076dde..64389c23c 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -1,3 +1,17 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import importlib import os import shutil @@ -244,7 +258,7 @@ def log_metric_plot(self, name: str, scores: List[float]): def apply_mlrun( - trainer: trl.DPOTrainer, + trainer: DPOTrainer, model_name: str = None, tag: str = "", context: mlrun.MLClientCtx = None, @@ -675,8 +689,6 @@ def dpo_train( # TODO: match forward.keyword to dataset.keyword - check if relevant in new design # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - import pdb - pdb.set_trace() # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, @@ -710,21 +722,24 @@ def dpo_train( model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - + whole_dataset = load_dataset(train_dataset, split='train') + whole_dataset = whole_dataset.shuffle(seed=42).train_test_split(seed=42, test_size=.3) + train_dataset = whole_dataset['train'] + eval_dataset = whole_dataset['test'] # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) + #tokenized_train, tokenized_eval = _prepare_dataset( + # train_dataset=train_dataset, + # eval_dataset=eval_dataset, + # train_load_dataset_kwargs=train_load_dataset_kwargs, + # eval_load_dataset_kwargs=eval_load_dataset_kwargs, + # tokenizer=tokenizer, + # dataset_columns_to_train=dataset_columns_to_train, + #) # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) + #data_collator = transformers.DataCollatorForLanguageModeling( + # tokenizer=tokenizer, mlm=False, **data_collator_config + #) # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] @@ -742,15 +757,15 @@ def dpo_train( **train_kwargs, ) - trainer = trl.DPOTrainer( + trainer = DPOTrainer( model=model, ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, + train_dataset=train_dataset, + eval_dataset=eval_dataset, peft_config=configs[ConfigKeys.peft_config], beta = configs[ConfigKeys.beta], tokenizer=tokenizer, - data_collator=data_collator, + #data_collator=data_collator, args=training_args, ) diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt index 215b90562..c03846397 100644 --- a/huggingface_dpo/requirements.txt +++ b/huggingface_dpo/requirements.txt @@ -5,3 +5,4 @@ datasets plotly trl mlrun +bitsandbytes diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index d2cfaaf02..fcd373759 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -1,7 +1,59 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import tempfile +from huggingface_dpo_trainer import dpo_train import mlrun +def test_dpo_fn(): + ctx = mlrun.get_or_create_ctx(name='test_dpo') + train_dataset = "unalignment/toxic-dpo-v0.2" + training_arguments = { + "evaluation_strategy": "steps", + "do_eval": True, + "optim": "paged_adamw_8bit", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 1, + "log_level": "info", + "save_steps": 100, + "learning_rate": 5e-7, + "eval_steps": 100, + "num_train_epochs": 1, + "max_steps": 100, + "warmup_steps": 20, + "fp16": True, + "lr_scheduler_type": "cosine", + "remove_unused_columns": True, + "gradient_checkpointing": True, + } + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + dpo_train( + context = ctx, + train_dataset = train_dataset, + model = (model_name,"transformers.AutoModelForCausalLM"), + tokenizer = tokenizer, + dataset_columns_to_train = ['chosen', 'rejected'], + training_config = training_arguments, + use_cuda = True, + beta = 0.1, + split='train', + ) + + def test_dpo_train(): From 1f059b891203de9da7f365d546a622310c8e6722 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 13:07:15 -0700 Subject: [PATCH 19/33] adding the maxlength --- huggingface_dpo/test_huggingface_dpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index fcd373759..6a434f7dc 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -51,6 +51,8 @@ def test_dpo_fn(): use_cuda = True, beta = 0.1, split='train', + max_length=1024, + max_prompt_length=2048, ) From e5d079249016d264090d9840736dc261203b05ae Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 15:42:16 -0700 Subject: [PATCH 20/33] get rid of the trainer interface --- huggingface_dpo/huggingface_dpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 64389c23c..349d98e1b 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -37,7 +37,7 @@ from plotly import graph_objects as go from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, + PreTrainedModel, PreTrainedTokenizer, TrainerCallback, TrainerControl, TrainerState, TrainingArguments) From eb300fd531888062b6f387d4c760a15735b3fa45 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 22:43:04 +0000 Subject: [PATCH 21/33] override --- huggingface_dpo/huggingface_dpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 64389c23c..844d6b69d 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -767,6 +767,8 @@ def dpo_train( tokenizer=tokenizer, #data_collator=data_collator, args=training_args, + max_length=1024, + max_prompt_length=2048, ) apply_mlrun(trainer, model_name=model_name.split("/")[-1]) From 14be77620ce1bfef106bc5dbf3aedaccd3372945 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 26 Mar 2024 21:51:09 +0000 Subject: [PATCH 22/33] training job can run but the artifact can't store 
--- huggingface_dpo/huggingface_dpo_trainer.py | 30 ++++--------------- .../test_huggingface_dpo_trainer.py | 16 +++++----- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index eddd74e8c..e50cb64af 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -41,19 +41,6 @@ TrainerCallback, TrainerControl, TrainerState, TrainingArguments) -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - class ConfigKeys: deepspeed = "deepspeed" @@ -61,7 +48,7 @@ class ConfigKeys: training = "training" tokenizer_pretrained = "tokenizer_pretrained" model_pretrained = "model_pretrained" - peft_config = "peft_config" + peft_config = "peft" data_collator = "data_collator" beta = "beta" @@ -317,7 +304,7 @@ def _print_trainable_parameters(model): ) PEFT_CONFIG = peft.LoraConfig( - r=8, + r=16, lora_alpha=16, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], @@ -360,7 +347,7 @@ def _update_config(src: dict, dst: dict): if config is True and config_name == "quantization": config = QUANTIZATION_CONFIG - if config is True and config_name == "lora": + if config is True and config_name == "peft": config = PEFT_CONFIG if config is True and config_name == "deepspeed": @@ -423,11 +410,6 @@ def _set_model_and_tokenizer( :returns: model and tokenizer """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - # load model from store if isinstance(model, str) and is_store_uri(model): pass @@ -702,6 +684,7 @@ def dpo_train( } _update_config(dst=configs, src=kwargs) + # check gpu permission and availability if use_cuda: if torch.cuda.is_available(): @@ -765,10 +748,9 @@ def dpo_train( peft_config=configs[ConfigKeys.peft_config], beta = configs[ConfigKeys.beta], tokenizer=tokenizer, - #data_collator=data_collator, args=training_args, - max_length=1024, - max_prompt_length=2048, + max_length=2048, + max_prompt_length=4096, ) apply_mlrun(trainer, model_name=model_name.split("/")[-1]) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 6a434f7dc..1f3a9a772 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -20,6 +20,7 @@ def test_dpo_fn(): ctx = mlrun.get_or_create_ctx(name='test_dpo') train_dataset = "unalignment/toxic-dpo-v0.2" + training_arguments = { "evaluation_strategy": "steps", "do_eval": True, @@ -28,12 +29,12 @@ def test_dpo_fn(): "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 100, + "save_steps": 2, "learning_rate": 5e-7, - "eval_steps": 100, + "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 100, - "warmup_steps": 20, + "max_steps": 10, + "warmup_steps": 5, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, @@ -44,15 +45,12 @@ def test_dpo_fn(): dpo_train( context = ctx, train_dataset = train_dataset, - model = (model_name,"transformers.AutoModelForCausalLM"), + peft_config=True, + model = model_name, tokenizer = tokenizer, - dataset_columns_to_train = ['chosen', 
'rejected'], training_config = training_arguments, use_cuda = True, beta = 0.1, - split='train', - max_length=1024, - max_prompt_length=2048, ) From 8ed0555aea873e0bc4e840f64cbd6e58bcae9b2f Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 27 Mar 2024 00:02:26 +0000 Subject: [PATCH 23/33] why the artifact can be stored? --- huggingface_dpo/huggingface_dpo_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index e50cb64af..9f5c00e19 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -241,7 +241,9 @@ def log_metric_plot(self, name: str, scores: List[float]): # Create the plotly artifact: artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) + import pdb + pdb.set_trace() + #self._artifacts[artifact_name] = self._context.log_artifact(artifact) def apply_mlrun( From 465c2087cb8a630c703f4a66ec7bfa6770965b32 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 27 Mar 2024 21:36:52 +0000 Subject: [PATCH 24/33] solved the naming issue, now can store the artifact --- huggingface_dpo/huggingface_dpo_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 9f5c00e19..fa534f631 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -239,11 +239,11 @@ def log_metric_plot(self, name: str, scores: List[float]): ) # Create the plotly artifact: + if '/' in name: + name = '_'.join(name.split('/')) artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - import pdb - pdb.set_trace() - #self._artifacts[artifact_name] = self._context.log_artifact(artifact) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) def apply_mlrun( From 308d94f4bb08ab9a2b3170891bdfed70b9b507b6 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 04:59:04 +0000 Subject: [PATCH 25/33] testing --- huggingface_dpo/huggingface_dpo_trainer.py | 115 +++--------------- .../test_huggingface_dpo_trainer.py | 17 +-- 2 files changed, 29 insertions(+), 103 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index fa534f631..823f83148 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -384,8 +384,6 @@ def _get_class_object(class_path: str) -> type: module_path, class_name = class_path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) - - def _set_model_and_tokenizer( model: Union[str, List[str]], tokenizer: Union[str, List[str]], @@ -490,7 +488,6 @@ def _set_model_and_tokenizer( return model_name, model, tokenizer - def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: """ loads the specific dataset provided by the user @@ -517,6 +514,7 @@ def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: return dataset.get("eval") elif "validation" in dataset: return dataset.get("validation") + return dataset def _prepare_dataset( @@ -524,8 +522,6 @@ def _prepare_dataset( eval_dataset: str, train_load_dataset_kwargs, eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], ) -> (Dataset, 
Union[Dataset, None]): """ Loads the train and eval datasets (if provided) passes them through the tokenizer and @@ -533,34 +529,11 @@ def _prepare_dataset( :param train_dataset: the name or path to the train dataset :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) :param train_load_dataset_kwargs: kwargs for dataset loading :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through :returns: tokenized datasets """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) # Load datasets # if provided two paths/names we load each separately using designated func @@ -571,7 +544,6 @@ def _prepare_dataset( eval_dataset = _dataset_loader( dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs ) - # if only on path is given then we must check if it contains both dataset or if only one should be used else: dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) @@ -584,42 +556,13 @@ def _prepare_dataset( elif "validation" in dataset: eval_dataset = dataset.get("validation") else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) + return train_dataset else: logger.error("train dataset is mandatory") raise KeyError("no train dataset found in given dataset") - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset + return train_dataset, eval_dataset def dpo_train( @@ -628,7 +571,6 @@ def dpo_train( eval_dataset: str = None, train_load_dataset_kwargs: dict = {}, eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", model: Union[str, List[str]] = "huggingface-model", tokenizer: Union[str, List[str]] = None, deepspeed_config: Union[dict, bool] = False, @@ -637,8 +579,8 @@ def dpo_train( beta: Union[float, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config : dict={}, task: str = "text-generation", use_cuda: bool = True, framework: str = "pt", @@ -646,33 +588,31 @@ def dpo_train( **kwargs, ): """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. 
+ Form a dpo training job to do llm alignment The function takes various configuration parameters to customize the training process and adapt the model to specific tasks using a provided dataset. :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path :param train_dataset: The train dataset used for fine-tuning the language model. :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path :param deepspeed_config: Configuration options for DeepSpeed (optional). :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param peft_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param beta: super parameter of KL divergence :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param model_pretrained_config: config to load the pretrained model + :param tokenizer_pretrained_config: config to load the pretrained tokenizer :param data_collator_config: Configuration options for data collation during training (optional). :param task: A description of the specific task the model is being fine-tuned for. + :param use_cuda: use gpu or not + :param framework: pt ot tf :param kwargs: Additional keyword arguments. 
""" - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, @@ -699,33 +639,16 @@ def dpo_train( model_name, model, tokenizer = _set_model_and_tokenizer( model=model, tokenizer=tokenizer, - task=task, framework=framework, + task = task, quantization_config=configs[ConfigKeys.quantization], use_cuda=use_cuda, tokenizer_pretrained_config=tokenizer_pretrained_config, model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - whole_dataset = load_dataset(train_dataset, split='train') - whole_dataset = whole_dataset.shuffle(seed=42).train_test_split(seed=42, test_size=.3) - train_dataset = whole_dataset['train'] - eval_dataset = whole_dataset['test'] - # Load datasets - #tokenized_train, tokenized_eval = _prepare_dataset( - # train_dataset=train_dataset, - # eval_dataset=eval_dataset, - # train_load_dataset_kwargs=train_load_dataset_kwargs, - # eval_load_dataset_kwargs=eval_load_dataset_kwargs, - # tokenizer=tokenizer, - # dataset_columns_to_train=dataset_columns_to_train, - #) - - # Initialize the data collator for the trainer to use in order to create batches of data - #data_collator = transformers.DataCollatorForLanguageModeling( - # tokenizer=tokenizer, mlm=False, **data_collator_config - #) - + train_dataset, eval_dataset = _prepare_dataset(train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs) + # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 1f3a9a772..64ec36886 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -18,40 +18,43 @@ import mlrun def test_dpo_fn(): + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + #dop_trainer = mlrun.import_function("function.yaml") + ctx = mlrun.get_or_create_ctx(name='test_dpo') train_dataset = "unalignment/toxic-dpo-v0.2" - + eval_dataset = "unalignment/toxic-dpo-v0.2" training_arguments = { "evaluation_strategy": "steps", - "do_eval": True, + "do_eval": False, "optim": "paged_adamw_8bit", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 2, + "save_steps": 5, "learning_rate": 5e-7, "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 10, + "max_steps": 5, "warmup_steps": 5, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, "gradient_checkpointing": True, } - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name dpo_train( context = ctx, train_dataset = train_dataset, + eval_dataset = eval_dataset, peft_config=True, model = model_name, tokenizer = tokenizer, training_config = training_arguments, use_cuda = True, beta = 0.1, - ) + ) From b660dd7134437cfcf0bccc60263cabc80d37eea4 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:00:57 +0000 Subject: [PATCH 26/33] fmt --- huggingface_dpo/huggingface_dpo_trainer.py | 56 +++++++++++------ .../test_huggingface_dpo_trainer.py | 61 +++++++++---------- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 
823f83148..1f5154a7b 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -32,14 +32,20 @@ from mlrun.frameworks._common import CommonTypes, MLRunInterface from mlrun.utils import logger from trl import DPOTrainer -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) +from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + PreTrainedModel, + PreTrainedTokenizer, + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) class ConfigKeys: @@ -239,8 +245,8 @@ def log_metric_plot(self, name: str, scores: List[float]): ) # Create the plotly artifact: - if '/' in name: - name = '_'.join(name.split('/')) + if "/" in name: + name = "_".join(name.split("/")) artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) self._artifacts[artifact_name] = self._context.log_artifact(artifact) @@ -308,8 +314,15 @@ def _print_trainable_parameters(model): PEFT_CONFIG = peft.LoraConfig( r=16, lora_alpha=16, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"], + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", @@ -384,6 +397,8 @@ def _get_class_object(class_path: str) -> type: module_path, class_name = class_path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) + + def _set_model_and_tokenizer( model: Union[str, List[str]], tokenizer: Union[str, List[str]], @@ -488,6 +503,7 @@ def _set_model_and_tokenizer( return model_name, model, tokenizer + def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: """ loads the specific dataset provided by the user @@ -561,7 +577,6 @@ def _prepare_dataset( logger.error("train dataset is mandatory") raise KeyError("no train dataset found in given dataset") - return train_dataset, eval_dataset @@ -579,8 +594,8 @@ def dpo_train( beta: Union[float, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config : dict={}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, task: str = "text-generation", use_cuda: bool = True, framework: str = "pt", @@ -626,7 +641,6 @@ def dpo_train( } _update_config(dst=configs, src=kwargs) - # check gpu permission and availability if use_cuda: if torch.cuda.is_available(): @@ -640,15 +654,17 @@ def dpo_train( model=model, tokenizer=tokenizer, framework=framework, - task = task, + task=task, quantization_config=configs[ConfigKeys.quantization], use_cuda=use_cuda, tokenizer_pretrained_config=tokenizer_pretrained_config, model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - train_dataset, eval_dataset = _prepare_dataset(train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs) - + train_dataset, eval_dataset = _prepare_dataset( + train_dataset, eval_dataset, train_load_dataset_kwargs, 
eval_load_dataset_kwargs + ) + # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] @@ -667,11 +683,11 @@ def dpo_train( trainer = DPOTrainer( model=model, - ref_model = None, + ref_model=None, train_dataset=train_dataset, eval_dataset=eval_dataset, peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], + beta=configs[ConfigKeys.beta], tokenizer=tokenizer, args=training_args, max_length=2048, diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 64ec36886..f073aafb5 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -17,46 +17,46 @@ import mlrun + def test_dpo_fn(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - #dop_trainer = mlrun.import_function("function.yaml") + # dop_trainer = mlrun.import_function("function.yaml") - ctx = mlrun.get_or_create_ctx(name='test_dpo') + ctx = mlrun.get_or_create_ctx(name="test_dpo") train_dataset = "unalignment/toxic-dpo-v0.2" eval_dataset = "unalignment/toxic-dpo-v0.2" training_arguments = { - "evaluation_strategy": "steps", - "do_eval": False, - "optim": "paged_adamw_8bit", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 1, - "log_level": "info", - "save_steps": 5, - "learning_rate": 5e-7, - "eval_steps": 1, - "num_train_epochs": 1, - "max_steps": 5, - "warmup_steps": 5, - "fp16": True, - "lr_scheduler_type": "cosine", - "remove_unused_columns": True, - "gradient_checkpointing": True, - } + "evaluation_strategy": "steps", + "do_eval": False, + "optim": "paged_adamw_8bit", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 1, + "log_level": "info", + "save_steps": 5, + "learning_rate": 5e-7, + "eval_steps": 1, + "num_train_epochs": 1, + "max_steps": 5, + "warmup_steps": 5, + "fp16": True, + "lr_scheduler_type": "cosine", + "remove_unused_columns": True, + "gradient_checkpointing": True, + } dpo_train( - context = ctx, - train_dataset = train_dataset, - eval_dataset = eval_dataset, - peft_config=True, - model = model_name, - tokenizer = tokenizer, - training_config = training_arguments, - use_cuda = True, - beta = 0.1, + context=ctx, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + peft_config=True, + model=model_name, + tokenizer=tokenizer, + training_config=training_arguments, + use_cuda=True, + beta=0.1, ) - def test_dpo_train(): @@ -81,7 +81,6 @@ def test_dpo_train(): "training_config": training_arguments, "dataset_columns_to_train": "quote", "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, } From 3fe14517bead761f82e68ca7e8e07940a422c48e Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:13:27 +0000 Subject: [PATCH 27/33] update the function yaml file --- huggingface_dpo/function.yaml | 45 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/huggingface_dpo/function.yaml b/huggingface_dpo/function.yaml index d0baab33a..c3593fa63 100644 --- a/huggingface_dpo/function.yaml +++ b/huggingface_dpo/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: huggingface-dpo-trainer tag: '' - hash: 3db0dab27e7aaa2f91a96c2545060cc7e1a15676 + hash: 584b20584f58bfa89225b6999e6b55ad017dd87a project: '' labels: author: pgw @@ -14,7 +14,7 @@ spec: args: [] image: mlrun/mlrun build: - functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft_config"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face Trainer API.&#x000D;
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,&#x000D;
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# will be used if user provides "True" with config name as input
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}
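&#x000D;
# Note (illustrative): passing ``True`` for one of these configs (e.g. ``quantization_config=True``&#x000D;
# in ``dpo_train``) selects the corresponding default above inside ``_update_config``; a dict or&#x000D;
# config object passed by the user is kept and only updated from any matching ``<prefix>_*`` kwargs.&#x000D;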


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can, for example, add or modify values in the default configs.&#x000D;

    Goes over all configs and their corresponding prefixes, collects all the keys from the given dict that start&#x000D;
     with the prefix, and adds them to the appropriate config.&#x000D;

    :param src: dict of all candidate values with which to update the configs.&#x000D;
    :param dst: dict containing all configs to update.
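&#x000D;
    Example (illustrative): a kwarg such as ``training_learning_rate=5e-7`` matches the&#x000D;
     ``training`` prefix and is therefore stored in the training config as ``learning_rate=5e-7``.&#x000D;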
    """

    for config_name, config in dst.items():

        # If given True we use the default config dict&#x000D;
        # Can also be False or a config dict given by the user, so we check specifically for True&#x000D;
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "lora":
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training if using a number of GPUs&#x000D;

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")&#x000D;

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a tuple then we assume it contains both the name and the class&#x000D;
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified we choose the default tokenizer that corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a training dataset we load the "train" split&#x000D;
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them&#x000D;
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and&#x000D;
    returns them ready to use in training.&#x000D;

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given then we must check whether it contains both datasets or only one should be used&#x000D;
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Large Language Model (LLM) on a specific task using the provided dataset.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).
    :param beta: The beta hyperparameter scaling the implicit KL penalty in the DPO loss (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

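    # DPO optimizes the policy directly on preference pairs (chosen vs. rejected responses);
    # `beta` scales the implicit KL penalty that keeps the trained policy close to the
    # reference policy (derived internally from the base model, since `ref_model` is None).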
    trainer = trl.DPOTrainer(
        model=model,
        ref_model=None,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    # Save the trained model to a temporary directory (mkdtemp, so it is not removed under us):
    temp_directory = tempfile.mkdtemp()
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

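    # Perplexity is the exponential of the mean negative log-likelihood over the evaluated windows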
    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from plotly import graph_objects as go
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    PreTrainedModel,
    PreTrainedTokenizer,
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the HuggingFace `DPOTrainer` API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        if "/" in name:
            name = "_".join(name.split("/"))
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# used when the user passes "True" for the corresponding config name
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.

    Goes over all configs and their corresponding prefixes, collects every key in the given dict that starts
    with the prefix and adds it to the appropriate config.

    :param src: dict of all candidate values with which to update the configs.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If True is given, we use the default config
        # It can also be False or a config dict provided by the user, so we check specifically for True
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "peft":
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})

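# Illustrative example (an assumption about usage, not part of the original code):
# keyword arguments prefixed with a config name are routed into that config, e.g. passing
# `training_num_train_epochs=3` through `dpo_train(**kwargs)` adds {"num_train_epochs": 3}
# to the training config, while `quantization_config=True` swaps in the default
# QUANTIZATION_CONFIG defined above.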

def _get_class_object(class_path: str) -> type:
    """
    Given a full class name, this function returns the correct class.

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :returns: the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training when using multiple GPUs

    :returns: model and tokenizer
    """
    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the name and the class
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # if we don't get the model class, we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    device_map = device_map if use_cuda else None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified, choose the default tokenizer corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer

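# Illustrative usage sketch (an assumption, not part of the original code): `model` may be
# a plain name such as "mistralai/Mistral-7B-Instruct-v0.2", or a [name, class-path] pair,
# e.g. ["mistralai/Mistral-7B-Instruct-v0.2", "transformers.AutoModelForCausalLM"], in
# which case `_get_class_object` resolves the class used to load the pretrained weights.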

def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a train dataset, load it with the "train" split
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")
    return dataset

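# Illustrative behavior (hypothetical dataset name): `_dataset_loader("org/my-dpo-dataset",
# is_train=True)` loads the "train" split, while `is_train=False` falls back to a "test",
# "eval" or "validation" split when one exists.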

def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
) -> Tuple[Dataset, Union[Dataset, None]]:
    """
    Loads the train and eval datasets (if provided) and returns them ready for use in training.

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading

    :returns: loaded datasets
    """

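    # Note: the datasets are not tokenized here; TRL's DPOTrainer expects preference-style
    # records (typically "prompt", "chosen" and "rejected" columns) and tokenizes them internally.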
    # Load datasets
    # if two paths/names are provided, load each separately using the designated loader
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )
    # if only one path is given, check whether it contains both datasets or only the train set
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only a train dataset was given, return it without an eval dataset
                return train_dataset, None
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    return train_dataset, eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Runs a DPO (Direct Preference Optimization) training job to align an LLM.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).
    :param beta: The beta hyperparameter scaling the implicit KL penalty in the DPO loss (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param use_cuda: use gpu or not
    :param framework: pt or tf
    :param kwargs: Additional keyword arguments.
    """

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        framework=framework,
        task=task,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )
    train_dataset, eval_dataset = _prepare_dataset(
        train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

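    # `ref_model=None` lets TRL derive the frozen reference policy from the base model
    # (the PEFT adapters are disabled for the reference pass); `beta` scales the implicit
    # KL penalty of the DPO loss, and max_length / max_prompt_length bound the tokenized
    # prompt and response lengths.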
    trainer = DPOTrainer(
        model=model,
        ref_model=None,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],
        tokenizer=tokenizer,
        args=training_args,
        max_length=2048,
        max_prompt_length=4096,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    # Save the trained model to a temporary directory (mkdtemp, so it is not removed under us):
    temp_directory = tempfile.mkdtemp()
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

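    # Perplexity is the exponential of the mean negative log-likelihood over the evaluated windows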
    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 commands: [] code_origin: '' origin_filename: '' @@ -34,7 +34,7 @@ spec: default: null outputs: - default: '' - lineno: 72 + lineno: 79 mlrun_train: name: mlrun_train doc: '' @@ -43,7 +43,7 @@ spec: default: '' outputs: - default: '' - lineno: 82 + lineno: 89 wrapper: name: wrapper doc: '' @@ -53,7 +53,7 @@ spec: default: '' outputs: - default: '' - lineno: 83 + lineno: 90 on_epoch_begin: name: on_epoch_begin doc: '' @@ -71,7 +71,7 @@ spec: default: '' outputs: - default: '' - lineno: 131 + lineno: 138 on_epoch_end: name: on_epoch_end doc: '' @@ -89,7 +89,7 @@ spec: default: '' outputs: - default: '' - lineno: 142 + lineno: 149 on_log: name: on_log doc: '' @@ -110,7 +110,7 @@ spec: default: null outputs: - default: '' - lineno: 153 + lineno: 160 on_train_begin: name: on_train_begin doc: '' @@ -128,7 +128,7 @@ spec: default: '' outputs: - default: '' - lineno: 179 + lineno: 186 on_train_end: name: on_train_end doc: '' @@ -152,7 +152,7 @@ spec: default: null outputs: - default: '' - lineno: 190 + lineno: 197 on_evaluate: name: on_evaluate doc: '' @@ -170,7 +170,7 @@ spec: default: '' outputs: - default: '' - lineno: 203 + lineno: 210 log_metrics: name: log_metrics doc: '' @@ -179,7 +179,7 @@ spec: default: '' outputs: - default: '' - lineno: 217 + lineno: 224 log_metric_plot: name: log_metric_plot doc: '' @@ -194,7 +194,7 @@ spec: default: '' outputs: - default: '' - lineno: 224 + lineno: 231 apply_mlrun: name: apply_mlrun doc: This is temporary and will be built in mlrun 1.5.0 @@ -222,13 +222,12 @@ spec: default: null outputs: - default: '' - lineno: 246 + lineno: 255 dpo_train: name: dpo_train - doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ - \ dataset.\n The function takes various configuration parameters to customize\ - \ the training process\n and adapt the model to specific tasks using a provided\ - \ dataset." + doc: "Form a dpo training job to do llm alignment\n The function takes various\ + \ configuration parameters to customize the training process\n and adapt the\ + \ model to specific tasks using a provided dataset." parameters: - name: context type: MLClientCtx @@ -250,10 +249,6 @@ spec: type: dict doc: kwargs for dataset loading default: {} - - name: dataset_columns_to_train - type: Union[str, list] - doc: which columns to pass to the model as inputs - default: text - name: model type: Union[str, List[str]] doc: a tuple containing model name and class, or str with model name or path @@ -273,9 +268,11 @@ spec: default: false - name: peft_config type: Union[dict, bool] + doc: Configuration options for Low-Rank Approximation (LoRA) (optional). 
default: false - name: beta type: Union[float, bool] + doc: super parameter of KL divergence default: false - name: training_config type: dict @@ -310,7 +307,7 @@ spec: default: auto outputs: - default: '' - lineno: 627 + lineno: 583 evaluate: name: evaluate doc: 'Evaluating the model using perplexity, for more information visit: @@ -337,7 +334,7 @@ spec: default: null outputs: - default: '' - lineno: 785 + lineno: 726 description: doing the alignment with dpo trainer default_handler: dpo_train disable_auto_mount: false From bbc2fa2001f7b7eb39a845526fc882eb985c6e88 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:13:45 +0000 Subject: [PATCH 28/33] update the test case --- huggingface_dpo/test_huggingface_dpo_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index f073aafb5..63f3f50c8 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -31,15 +31,15 @@ def test_dpo_fn(): "do_eval": False, "optim": "paged_adamw_8bit", "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 5, + "save_steps": 1, "learning_rate": 5e-7, "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 5, - "warmup_steps": 5, + "max_steps": 1, + "warmup_steps": 1, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, From c781ecf4f1aec77f2d9a692330975ba3171b0f8e Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:17:14 +0000 Subject: [PATCH 29/33] passed the test case --- .../test_huggingface_dpo_trainer.py | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 63f3f50c8..1aa31707e 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -21,7 +21,6 @@ def test_dpo_fn(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - # dop_trainer = mlrun.import_function("function.yaml") ctx = mlrun.get_or_create_ctx(name="test_dpo") train_dataset = "unalignment/toxic-dpo-v0.2" @@ -56,43 +55,3 @@ def test_dpo_fn(): use_cuda=True, beta=0.1, ) - - -def test_dpo_train(): - - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name - dop_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "ref_model": None, - "tokenizer": tokenizer, - "train_dataset": "Abirate/english_quotes", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - dpo_trainer.run( - local=True, - params=params, - handler="dpo_train", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") From 2f5361e72827d84c70dd240a5857b6b9e8459785 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:26:38 +0000 Subject: [PATCH 30/33] adding the function yaml to 
the test case --- .../test_huggingface_dpo_trainer.py | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 1aa31707e..98783c644 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -13,12 +13,13 @@ # limitations under the License. import tempfile -from huggingface_dpo_trainer import dpo_train +# from huggingface_dpo_trainer import dpo_train import mlrun def test_dpo_fn(): + dpo_trainer = mlrun.import_function("function.yaml") model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name @@ -44,14 +45,24 @@ def test_dpo_fn(): "remove_unused_columns": True, "gradient_checkpointing": True, } - dpo_train( - context=ctx, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - peft_config=True, - model=model_name, - tokenizer=tokenizer, - training_config=training_arguments, - use_cuda=True, - beta=0.1, - ) + params = { + "model": model_name, + "tokenizer": tokenizer, + "train_dataset": train_dataset, + "eval_dataset": eval_dataset, + "peft_config": True, + "training_config": training_arguments, + "use_cuda": True, + "beta": 0.1, + } + try: + with tempfile.TemporaryDirectory() as test_directory: + dpo_trainer.run( + local=True, + params=params, + handler="dpo_train", + returns=["model"], + workdir=test_directory, + ) + except Exception as exception: + print(f"-The training failed -raised the following error: \n -{exception}") From d63b755d4e84f58184529cb8a81f9344d07278c5 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:30:47 +0000 Subject: [PATCH 31/33] should be good for the notebook --- huggingface_dpo/test_huggingface_dpo_trainer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 98783c644..db289b51e 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -13,8 +13,6 @@ # limitations under the License. import tempfile - -# from huggingface_dpo_trainer import dpo_train import mlrun From 5d1ccc444d89cd041c4ffe2d06da384f5bdf1507 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:54:43 +0000 Subject: [PATCH 32/33] adding the notebook and raise the PR --- huggingface_dpo/huggingface_dpo_trainer.ipynb | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 huggingface_dpo/huggingface_dpo_trainer.ipynb diff --git a/huggingface_dpo/huggingface_dpo_trainer.ipynb b/huggingface_dpo/huggingface_dpo_trainer.ipynb new file mode 100644 index 000000000..b0b0f60ae --- /dev/null +++ b/huggingface_dpo/huggingface_dpo_trainer.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a2c5dc6d-33d0-4e74-a875-6eab556e3b2d", + "metadata": {}, + "source": [ + "# DPO trainer for llm alignment" + ] + }, + { + "cell_type": "markdown", + "id": "cc7aa261-17b2-4362-bf6a-34af79b0230b", + "metadata": {}, + "source": [ + "## Notebook Introduction: Doing the llm alignment with DPO trainer\n", + "\n", + "In this notebook, we will walk you through a step-by-step process of how to do alignment for a SOTA llm with DPO method. You don't need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness." 
+ ] + }, + { + "cell_type": "markdown", + "id": "425249e9-f43f-45e6-aa25-9f53099049cd", + "metadata": {}, + "source": [ + "### First, we will select the model we wish to align and take the matching tokenizer and appropriate config" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3410e9c2-0557-4961-995e-0ef0cc07bf82", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig\n", + "from transformers import logging\n", + "\n", + "logging.set_verbosity(\"CRITICAL\")\n", + "\n", + "model_name = \"mistralai/Mistral-7B-Instruct-v0.2\"\n", + "tokenizer = model_name\n", + "generation_config = GenerationConfig.from_pretrained(model_name)" + ] + }, + { + "cell_type": "markdown", + "id": "f33f3c35-cf61-4b0f-8da9-1c30d3b53230", + "metadata": {}, + "source": [ + "### Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a8ee7c35-adf7-4ed8-9e7e-e659b9461cd5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:49:17,440 [info] Project loaded successfully: {'project_name': 'dpo-trainer-test'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "\n", + "project = mlrun.get_or_create_project(\n", + " name=\"dpo-trainer-test\",\n", + " context=\"./\",\n", + " user_project=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d56b834f-adf6-4736-8de7-3348e050f561", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.set_function(\n", + " \"huggingface_dpo_trainer.py\",\n", + " name=\"dpo-trainer\",\n", + " kind=\"local\",\n", + " handler=\"dpo_train\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "f42315db-6ddd-4dc1-89f3-c732f92d0d47", + "metadata": {}, + "source": [ + "### we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the function" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8e62e577-15fb-477d-9c56-fa9fb4c2669b", + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = \"reciprocate/ultrafeedback_cleaned_high_dpo\"\n", + "eval_dataset = \"reciprocate/ultrafeedback_cleaned_high_dpo\"\n", + "training_arguments = {\n", + " \"evaluation_strategy\": \"steps\",\n", + " \"do_eval\": True,\n", + " \"optim\": \"paged_adamw_8bit\",\n", + " \"per_device_train_batch_size\": 1,\n", + " \"gradient_accumulation_steps\": 1,\n", + " \"per_device_eval_batch_size\": 1,\n", + " \"log_level\": \"info\",\n", + " \"save_steps\": 1,\n", + " \"learning_rate\": 5e-7,\n", + " \"eval_steps\": 1,\n", + " \"num_train_epochs\": 1,\n", + " \"max_steps\": 1,\n", + " \"warmup_steps\": 1,\n", + " \"fp16\": True,\n", + " \"lr_scheduler_type\": \"cosine\",\n", + " \"remove_unused_columns\": True,\n", + " \"gradient_checkpointing\": True,\n", + "}\n", + "params = {\n", + " \"model\": model_name,\n", + " \"tokenizer\": tokenizer,\n", + " \"train_dataset\": train_dataset,\n", + " \"eval_dataset\": eval_dataset,\n", + " \"peft_config\": True,\n", + " \"training_config\": training_arguments,\n", + " \"use_cuda\": True,\n", + " \"beta\": 0.1,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "284a5772-f88d-46c9-87bc-fc14e434c1b4", + "metadata": {}, + 
"source": [ + "### Now we simply run the function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11ab5888-5870-4bf8-9657-db930adecd77", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:49:20,738 [info] Storing function: {'name': 'dpo-trainer', 'uid': 'b4ed0d2bdc8c4e44892aee1a3549969d', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3a28ff59fc674c4aac2e2ee2d1bf0211", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00 2024-04-01 16:49:40,542 [info] training 'mistralai/Mistral-7B-Instruct-v0.2'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** Running training *****\n", + " Num examples = 541\n", + " Num Epochs = 1\n", + " Instantaneous batch size per device = 1\n", + " Total train batch size (w. parallel, distributed & accumulation) = 1\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 1\n", + " Number of trainable parameters = 41,943,040\n", + "torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n", + "None of the inputs have requires_grad=True. Gradients will be None\n", + "Could not estimate the number of tokens of the input, floating-point operations will not be computed\n", + "***** Running Evaluation *****\n", + " Num examples = 541\n", + " Batch size = 1\n" + ] + } + ], + "source": [ + "training_run = mlrun.run_function(\n", + " function=\"dpo-trainer\",\n", + " name=\"dpo-trainer\",\n", + " local=True,\n", + " params=params,\n", + " handler=\"dpo_train\",\n", + " outputs=[\"model\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e674d25-5f1f-4ea8-af02-7d22c2fb6760", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4dfe9b-407a-43c0-9c5e-56de106477ac", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dpo", + "language": "python", + "name": "conda-env-.conda-dpo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From bf66dfd8944d3532431a3e09687e4310c72bd3f0 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:57:34 +0000 Subject: [PATCH 33/33] raise the PR --- huggingface_dpo/huggingface_dpo_trainer.ipynb | 322 +++++++++++++++++- 1 file changed, 320 insertions(+), 2 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.ipynb b/huggingface_dpo/huggingface_dpo_trainer.ipynb index b0b0f60ae..07dfcf024 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.ipynb +++ b/huggingface_dpo/huggingface_dpo_trainer.ipynb @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "11ab5888-5870-4bf8-9657-db930adecd77", "metadata": {}, "outputs": [ @@ -229,7 +229,325 @@ "Could not estimate the number of tokens of the input, 
floating-point operations will not be computed\n", "***** Running Evaluation *****\n", " Num examples = 541\n", - " Batch size = 1\n" + " Batch size = 1\n", + "Saving model checkpoint to /tmp/tmp1k687jql/tmp-checkpoint-1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'eval_train_loss': 0.6931472420692444, 'eval_train_runtime': 365.1876, 'eval_train_samples_per_second': 1.481, 'eval_train_steps_per_second': 1.481, 'eval_rewards/chosen': 0.0, 'eval_rewards/rejected': 0.0, 'eval_rewards/accuracies': 0.0, 'eval_rewards/margins': 0.0, 'eval_logps/rejected': -127.08296203613281, 'eval_logps/chosen': -328.57867431640625, 'eval_logits/rejected': -2.3305602073669434, 'eval_logits/chosen': -2.911039113998413, 'epoch': 0.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /igz/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/config.json\n", + "Model config MistralConfig {\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.38.2\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "tokenizer config file saved in /tmp/tmp1k687jql/tmp-checkpoint-1/tokenizer_config.json\n", + "Special tokens file saved in /tmp/tmp1k687jql/tmp-checkpoint-1/special_tokens_map.json\n", + "\n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Saving model checkpoint to /tmp/tmpe5yijcu0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'train_runtime': 367.9669, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'train_loss': 0.6931471824645996, 'epoch': 0.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /igz/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/config.json\n", + "Model config MistralConfig {\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.38.2\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "tokenizer config file saved in /tmp/tmpe5yijcu0/tokenizer_config.json\n", + "Special tokens file saved in /tmp/tmpe5yijcu0/special_tokens_map.json\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
dpo-trainer-test-pengwei0Apr 01 16:49:20completeddpo-trainer
v3io_user=pengwei
kind=local
owner=pengwei
host=jupyter-pengwei-gpu-86c58c8f79-8ls8j
model=mistralai/Mistral-7B-Instruct-v0.2
tokenizer=mistralai/Mistral-7B-Instruct-v0.2
train_dataset=unalignment/toxic-dpo-v0.2
eval_dataset=unalignment/toxic-dpo-v0.2
peft_config=True
training_config={'evaluation_strategy': 'steps', 'do_eval': False, 'optim': 'paged_adamw_8bit', 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 1, 'per_device_eval_batch_size': 1, 'log_level': 'info', 'save_steps': 1, 'learning_rate': 5e-07, 'eval_steps': 1, 'num_train_epochs': 1, 'max_steps': 1, 'warmup_steps': 1, 'fp16': True, 'lr_scheduler_type': 'cosine', 'remove_unused_columns': True, 'gradient_checkpointing': True}
use_cuda=True
beta=0.1
eval_train_loss=0.6931472420692444
eval_train_runtime=365.1876
eval_train_samples_per_second=1.481
eval_train_steps_per_second=1.481
eval_rewards/chosen=0.0
eval_rewards/rejected=0.0
eval_rewards/accuracies=0.0
eval_rewards/margins=0.0
eval_logps/rejected=-127.08296203613281
eval_logps/chosen=-328.57867431640625
eval_logits/rejected=-2.3305602073669434
eval_logits/chosen=-2.911039113998413
train_runtime=367.9669
train_samples_per_second=0.003
train_steps_per_second=0.003
total_flos=0.0
train_loss=0.6931471824645996
model
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:55:57,867 [info] Run execution finished: {'status': 'completed', 'name': 'dpo-trainer'}\n" ] } ],