From 549ec1b6c7a5dadf2bd9efc349886462dd3c4279 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 6 Mar 2024 12:55:42 -0800 Subject: [PATCH 01/33] make some changes for the auto trainer using the DPO trainer --- huggingface_dpo/huggingface_auto_trainer.py | 855 ++++++++++++++++++++ huggingface_dpo/huggingface_dpo.py | 855 ++++++++++++++++++++ 2 files changed, 1710 insertions(+) create mode 100644 huggingface_dpo/huggingface_auto_trainer.py create mode 100644 huggingface_dpo/huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_auto_trainer.py b/huggingface_dpo/huggingface_auto_trainer.py new file mode 100644 index 000000000..d1166318c --- /dev/null +++ b/huggingface_dpo/huggingface_auto_trainer.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: 
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + 
eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..d1166318c --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: 
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + 
eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) From cca1e7ee28c169171b39f50095ed03f43ffd390c Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 6 Mar 2024 12:56:13 -0800 Subject: [PATCH 02/33] adding the dpo from trl --- huggingface_dpo/huggingface_auto_trainer.py | 855 -------------------- 1 file changed, 855 deletions(-) delete mode 100644 huggingface_dpo/huggingface_auto_trainer.py diff --git a/huggingface_dpo/huggingface_auto_trainer.py b/huggingface_dpo/huggingface_auto_trainer.py deleted file mode 100644 index d1166318c..000000000 --- a/huggingface_dpo/huggingface_auto_trainer.py +++ /dev/null @@ -1,855 +0,0 @@ -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - lora = "lora" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - data_collator = "data_collator" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = 
True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. - """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - lora_config: dict, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param lora_config: lora config or None, to load model in appropriate way - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # If lora config was given we want to do lora fine tune, we update model here - if lora_config: - model = peft.get_peft_model(model, lora_config) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: 
- tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - 
eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - lora_config: Union[dict, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. - - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. 
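# A minimal invocation sketch for the parameters documented above, mirroring the
# accompanying test (test_huggingface_dpo.py, added later in this series). The
# "function.yaml" spec, the dataset name and the hyperparameter values are
# illustrative assumptions, not part of this patch.
import mlrun

auto_trainer = mlrun.import_function("function.yaml")

training_run = auto_trainer.run(
    local=True,
    handler="finetune_llm",
    returns=["model"],
    params={
        "model": ("distilgpt2", "transformers.AutoModelForCausalLM"),
        "tokenizer": "distilgpt2",
        "train_dataset": "Abirate/english_quotes",
        "dataset_columns_to_train": "quote",
        "training_config": {"max_steps": 10, "per_device_train_batch_size": 4},
        "use_cuda": False,
    },
)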
- """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.lora: lora_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - lora_config=configs[ConfigKeys.lora], - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = transformers.Trainer( - model=model, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! 
- ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
-            neg_log_likelihood = outputs.loss
-
-        nlls.append(neg_log_likelihood)
-
-        prev_end_loc = end_loc
-        if end_loc == seq_len:
-            break
-
-    ppl = torch.exp(torch.stack(nlls).mean()).item()
-    context.log_result("perplexity", ppl)

From 01c1d08bd35d449db3d56e5fc28632426cd998b8 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 19:05:45 -0700
Subject: [PATCH 03/33] should use dpo_trainer for dpo training

---
 huggingface_dpo/huggingface_dpo.py      |  1 +
 huggingface_dpo/test_huggingface_dpo.py | 42 +++++++++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 huggingface_dpo/test_huggingface_dpo.py

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
index d1166318c..bf2ed3cf0 100644
--- a/huggingface_dpo/huggingface_dpo.py
+++ b/huggingface_dpo/huggingface_dpo.py
@@ -17,6 +17,7 @@
 from mlrun.datastore import is_store_uri
 from mlrun.frameworks._common import CommonTypes, MLRunInterface
 from mlrun.utils import logger
+from trl import DPOTrainer
 from peft import (LoraConfig, PeftModel, get_peft_model,
                   prepare_model_for_kbit_training)
 from plotly import graph_objects as go

diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py
new file mode 100644
index 000000000..53576e4e7
--- /dev/null
+++ b/huggingface_dpo/test_huggingface_dpo.py
@@ -0,0 +1,42 @@
+import tempfile
+
+import mlrun
+
+
+def test_train():
+
+    model_name = "distilgpt2"
+    tokenizer = model_name
+    auto_trainer = mlrun.import_function("function.yaml")
+
+    training_arguments = {
+        "per_device_train_batch_size": 4,
+        "gradient_accumulation_steps": 1,
+        "warmup_steps": 2,
+        "max_steps": 10,
+        "learning_rate": 2e-4,
+        "logging_steps": 1,
+    }
+
+    params = {
+        "model": (model_name, "transformers.AutoModelForCausalLM"),
+        "tokenizer": tokenizer,
+        "train_dataset": "Abirate/english_quotes",
+        "training_config": training_arguments,
+        "dataset_columns_to_train": "quote",
+        "model_pretrained_config": {"use_cache": False},
+        "use_cuda": False,
+    }
+
+    try:
+        with tempfile.TemporaryDirectory() as test_directory:
+            auto_trainer.run(
+                local=True,
+                params=params,
+                handler="finetune_llm",
+                returns=["model"],
+                workdir=test_directory,
+            )
+
+    except Exception as exception:
+        print(f"- The training failed - raised the following error:\n- {exception}")

From c691afcb446c079426051fce3ca7e7d45bd12809 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:20:16 +0000
Subject: [PATCH 04/33] adding the req

---
 huggingface_dpo/requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 huggingface_dpo/requirements.txt

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
new file mode 100644
index 000000000..1376b1d00
--- /dev/null
+++ b/huggingface_dpo/requirements.txt
@@ -0,0 +1,5 @@
+peft
+transformers
+torch
+datasets
+plotly

From c010d6d6bf0bbeaa6ae24ba25b5037e2ec4486c3 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:22:28 +0000
Subject: [PATCH 05/33] using the dpo trainer

---
 huggingface_dpo/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
index 1376b1d00..a86a25fb4 100644
--- a/huggingface_dpo/requirements.txt
+++ b/huggingface_dpo/requirements.txt
@@ -3,3 +3,4 @@ transformers
 torch
 datasets
 plotly
+trl

From 95b5ce53b58fde34e9ed232ab1cb65b3ecc3f58b Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:53:00 +0000
Subject: [PATCH 06/33] adding the mlrun

---
 huggingface_dpo/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
index a86a25fb4..215b90562 100644
--- a/huggingface_dpo/requirements.txt
+++ b/huggingface_dpo/requirements.txt
@@ -4,3 +4,4 @@ torch
 datasets
 plotly
 trl
+mlrun

From 49159194b520be1514caba7e73f824638254888c Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 21:10:07 -0700
Subject: [PATCH 07/33] adding the dpo trainer

---
 huggingface_dpo/huggingface_dpo.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
index bf2ed3cf0..a8c46b768 100644
--- a/huggingface_dpo/huggingface_dpo.py
+++ b/huggingface_dpo/huggingface_dpo.py
@@ -44,11 +44,12 @@ class ConfigKeys:
     deepspeed = "deepspeed"
     quantization = "quantization"
-    lora = "lora"
     training = "training"
     tokenizer_pretrained = "tokenizer_pretrained"
     model_pretrained = "model_pretrained"
+    peft_config = "peft_config"
     data_collator = "data_collator"
+    beta = "beta"
@@ -70,7 +71,7 @@ class HFTrainerMLRunInterface(MLRunInterface, ABC):
     @classmethod
     def add_interface(
         cls,
-        obj: Trainer,
+        obj: DPOTrainer,
         restoration: CommonTypes.MLRunInterfaceRestorationType = None,
     ):
         super(HFTrainerMLRunInterface, cls).add_interface(
@@ -79,7 +80,7 @@ def add_interface(
 
     @classmethod
     def mlrun_train(cls):
-        def wrapper(self: Trainer, *args, **kwargs):
+        def wrapper(self: DPOTrainer, *args, **kwargs):
             # Restore the evaluation method as `train` will use it:
             # cls._restore_attribute(obj=self, attribute_name="evaluate")
 
@@ -386,7 +387,6 @@ def _set_model_and_tokenizer(
     tokenizer: Union[str, List[str]],
     task: str,
     framework: str,
-    lora_config: dict,
     quantization_config: dict,
     use_cuda: bool,
     tokenizer_pretrained_config,
@@ -400,7 +400,6 @@ def _set_model_and_tokenizer(
     :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
     :param task: a supported nlp task, used to choose model if not provided
     :param framework: pt or tf
-    :param lora_config: lora config or None, to load model in appropriate way
     :param quantization_config: quantization config or None, to load model in appropriate way
     :param use_cuda: use gpu or not
     :param tokenizer_pretrained_config: config to load the pretrained tokenizer
@@ -470,10 +469,6 @@ def _set_model_and_tokenizer(
         model.gradient_checkpointing_enable()
         model = peft.prepare_model_for_kbit_training(model)
 
-    # If lora config was given we want to do lora fine tune, we update model here
-    if lora_config:
-        model = peft.get_peft_model(model, lora_config)
-
     # if not specified we choose the default tokenizer that corresponding to the model
     if tokenizer is None:
         tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
@@ -639,7 +634,8 @@ def finetune_llm(
     tokenizer: Union[str, List[str]] = None,
     deepspeed_config: Union[dict, bool] = False,
     quantization_config: Union[dict, bool] = False,
-    lora_config: Union[dict, bool] = False,
+    peft_config: Union[dict, bool] = False,
+    beta: Union[float, bool] = False,
     training_config: dict = {},
     model_pretrained_config: dict = {},
     tokenizer_pretrained_config: dict = {},
@@ -683,11 +679,12 @@ def finetune_llm(
     configs = {
         ConfigKeys.deepspeed: deepspeed_config,
         ConfigKeys.quantization: quantization_config,
-        ConfigKeys.lora: lora_config,
         ConfigKeys.training: training_config,
         ConfigKeys.model_pretrained: model_pretrained_config,
         ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
         ConfigKeys.data_collator: data_collator_config,
+        ConfigKeys.peft_config: peft_config,
+        ConfigKeys.beta: beta,
     }
     _update_config(dst=configs, src=kwargs)
@@ -705,7 +702,6 @@ def finetune_llm(
         tokenizer=tokenizer,
         task=task,
         framework=framework,
-        lora_config=configs[ConfigKeys.lora],
         quantization_config=configs[ConfigKeys.quantization],
         use_cuda=use_cuda,
         tokenizer_pretrained_config=tokenizer_pretrained_config,
@@ -744,10 +740,13 @@ def finetune_llm(
         **train_kwargs,
     )
 
-    trainer = transformers.Trainer(
+    trainer = DPOTrainer(
         model=model,
+        ref_model=None,
         train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
+        peft_config=configs[ConfigKeys.peft_config],
+        beta=configs[ConfigKeys.beta],
         tokenizer=tokenizer,
         data_collator=data_collator,
         args=training_args,

From 96c08f4d5fbf44f65e0efc1bd3ac2d6d73253ae9 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 21:13:01 -0700
Subject: [PATCH 08/33] add dpo trainer

---
 huggingface_dpo/huggingface_dpo.py | 855 -----------------------------
 1 file changed, 855 deletions(-)
 delete mode 100644 huggingface_dpo/huggingface_dpo.py

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
deleted file mode 100644
index a8c46b768..000000000
--- a/huggingface_dpo/huggingface_dpo.py
+++ /dev/null
@@ -1,855 +0,0 @@
-import importlib
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Dict, List, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import peft
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from mlrun.artifacts.manager import Artifact, PlotlyArtifact
-from mlrun.datastore import is_store_uri
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import logger
-from trl import DPOTrainer
-from peft import (LoraConfig, PeftModel, get_peft_model,
-                  prepare_model_for_kbit_training)
-from plotly import graph_objects as go
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
-                          TrainerCallback, TrainerControl, TrainerState,
-                          TrainingArguments)
-
-supported_tasks = [
-    "question-answering",
-    "summarization",
-    "table-question-answering",
-    "text2text-generation",
-    "text-classification",
-    "sentiment-analysis",
-    "text-generation",
-    "token-classification",
-    "translation",
-    "translation_xx_to_yy",
-]
-
-
-class ConfigKeys:
-    deepspeed = "deepspeed"
-    quantization = "quantization"
-    training = "training"
-    tokenizer_pretrained = "tokenizer_pretrained"
-    model_pretrained = "model_pretrained"
-    peft_config = "peft_config"
-    data_collator = "data_collator"
-    beta = "beta"
-
-
-# ----------------------from MLRUN--------------------------------
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
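# Worth illustrating next to the DPOTrainer switch above: TRL's DPO trainer consumes
# preference pairs rather than a single tokenized text column. A minimal sketch of
# that dataset shape (column names follow the trl convention; the rows are invented):
from datasets import Dataset

preference_pairs = Dataset.from_dict(
    {
        "prompt": ["Explain what MLRun does."],
        "chosen": ["MLRun orchestrates and tracks machine learning pipelines."],
        "rejected": ["No idea."],
    }
)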
- _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: DPOTrainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: DPOTrainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - 
self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
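# A small illustration of the prefix routing described above, assuming the
# _update_config helper defined in this file; the key names and values are made up.
configs = {"training": {"max_steps": 10}, "deepspeed": False}
overrides = {"training_learning_rate": 2e-4, "training_warmup_steps": 2}

_update_config(src=overrides, dst=configs)
assert configs["training"] == {"max_steps": 10, "learning_rate": 2e-4, "warmup_steps": 2}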
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - 
logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - 
train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - peft_config: Union[dict, bool] = False, - beta: Union[float, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. 
- - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - ConfigKeys.peft_config: peft_config, - ConfigKeys.beta: beta, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the 
trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = trl.DPOTrainer( - model=model, - ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
- neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) From 1cb999eb3c4b06f7a39614384ae3a0368f5a6420 Mon Sep 17 00:00:00 2001 From: peng wei Date: Sun, 17 Mar 2024 21:13:37 -0700 Subject: [PATCH 09/33] added dpo trainer --- huggingface_dpo/huggingface_dpo.py | 855 +++++++++++++++++++++++++++++ 1 file changed, 855 insertions(+) create mode 100644 huggingface_dpo/huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..a8c46b768 --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from trl import DPOTrainer +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + peft_config = "peft_config" + data_collator = "data_collator" + beta = "beta" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: DPOTrainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: DPOTrainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
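# A minimal wiring sketch for this callback via apply_mlrun, assuming a trainer
# instance (e.g. the DPOTrainer built in finetune_llm) already exists; the model
# name is illustrative.
import mlrun

ctx = mlrun.get_or_create_ctx("mlrun-huggingface")
apply_mlrun(trainer, model_name="distilgpt2", context=ctx)
trainer.train()  # metric values reported at each log step are written to the MLRun context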
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then 
it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + 
*[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + peft_config: Union[dict, bool] = False, + beta: Union[float, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + ConfigKeys.peft_config: peft_config, + ConfigKeys.beta: beta, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = trl.DPOTrainer( + model=model, + ref_model = None, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + peft_config=configs[ConfigKeys.peft_config], + beta = configs[ConfigKeys.beta], + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) From 7e6af5fc35c1da8ae466a8c45d7d8cc84762edeb Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 18 Mar 2024 23:05:49 +0000 Subject: [PATCH 10/33] continue the coding --- huggingface_dpo/huggingface_dpo.py | 9 +++++---- huggingface_dpo/test_huggingface_dpo.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py index a8c46b768..31e418f30 100644 --- a/huggingface_dpo/huggingface_dpo.py +++ b/huggingface_dpo/huggingface_dpo.py @@ -244,7 +244,7 @@ def log_metric_plot(self, name: str, scores: List[float]): def apply_mlrun( - trainer: transformers.Trainer, + trainer: trl.DPOTrainer, model_name: str = None, tag: str = "", context: mlrun.MLClientCtx = None, @@ -302,10 +302,11 @@ def _print_trainable_parameters(model): bnb_4bit_compute_dtype=torch.bfloat16, ) -LORA_CONFIG = peft.LoraConfig( +PEFT_CONFIG = peft.LoraConfig( r=8, - lora_alpha=32, - target_modules=["query_key_value"], + lora_alpha=16, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index 53576e4e7..691605c83 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -5,7 +5,7 @@ def test_train(): - model_name = "distilgpt2" + model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name auto_trainer = mlrun.import_function("function.yaml") From d4e0940dbd7b1aa45b66995980ef4542004cd94f Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 16:37:55 +0000 Subject: [PATCH 11/33] should be the same as trainer --- huggingface_dpo/huggingface_dpo.py | 4 ++-- huggingface_dpo/item.yaml | 23 +++++++++++++++++++++++ huggingface_dpo/test_huggingface_dpo.py | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 huggingface_dpo/item.yaml diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py index 31e418f30..5f2a680d0 100644 --- a/huggingface_dpo/huggingface_dpo.py +++ b/huggingface_dpo/huggingface_dpo.py @@ -347,7 +347,7 @@ def _update_config(src: dict, dst: dict): config = QUANTIZATION_CONFIG if config is True and config_name == "lora": - config = LORA_CONFIG + config = PEFT_CONFIG if config is True and config_name == "deepspeed": config = DEEPSPEED_CONFIG @@ -624,7 +624,7 @@ def _prepare_dataset( return tokenized_train_dataset, tokenized_eval_dataset -def finetune_llm( +def dpo_train( context: mlrun.MLClientCtx, train_dataset: Union[str, mlrun.datastore.DataItem], eval_dataset: str = None, diff --git a/huggingface_dpo/item.yaml b/huggingface_dpo/item.yaml new file mode 100644 index 000000000..4f6cc1c1c --- /dev/null +++ b/huggingface_dpo/item.yaml @@ -0,0 +1,23 @@ + +apiVersion: v1 +categories: [] # List of category names +description: '' # Short description +doc: '' # Path to README.md if exists +example: '' # Path to examole notebook +generationDate: 2024-03-19 16:26:27.342027 +icon: '' # Path to icon file +labels: {} # Key values label pairs +maintainers: [] # List of maintainers +mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema +name: '' # Function name +platformVersion: '' # 
Function’s Iguazio version requirement, should follow python’s versioning schema +spec: + filename: '' # Implementation file + handler: '' # Handler function name + image: '' # Base image name + kind: '' # Function kind + requirements: [] # List of Pythonic library requirements + customFields: {} # Custom spec fields + env: [] # Spec environment params +url: '' +version: 0.0.1 # Function version, should follow standard semantic versioning schema diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index 691605c83..adf70b494 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -33,7 +33,7 @@ def test_train(): auto_trainer.run( local=True, params=params, - handler="finetune_llm", + handler="dpo_train", returns=["model"], workdir=test_directory, ) From 1c26ef12b940c71a634bc9a75fa1ef0dccc38ef4 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 16:41:35 +0000 Subject: [PATCH 12/33] try generate the function.yaml --- huggingface_dpo/test_huggingface_dpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index adf70b494..7899debba 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -3,7 +3,7 @@ import mlrun -def test_train(): +def test_dpo_train(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name From 93beb7bcb7bbb5c0bea3291ad96bbb92f742a927 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:04:21 +0000 Subject: [PATCH 13/33] adding the dop_trainer --- .../{huggingface_dpo.py => huggingface_dpo_trainer.py} | 0 .../{test_huggingface_dpo.py => test_huggingface_dpo_trainer.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename huggingface_dpo/{huggingface_dpo.py => huggingface_dpo_trainer.py} (100%) rename huggingface_dpo/{test_huggingface_dpo.py => test_huggingface_dpo_trainer.py} (100%) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo_trainer.py similarity index 100% rename from huggingface_dpo/huggingface_dpo.py rename to huggingface_dpo/huggingface_dpo_trainer.py diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo_trainer.py similarity index 100% rename from huggingface_dpo/test_huggingface_dpo.py rename to huggingface_dpo/test_huggingface_dpo_trainer.py From a3c78626af0afe37469abb975ece3bc3c8da3de7 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:05:09 +0000 Subject: [PATCH 14/33] update item --- huggingface_dpo/item.yaml | 40 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/huggingface_dpo/item.yaml b/huggingface_dpo/item.yaml index 4f6cc1c1c..3eff1eede 100644 --- a/huggingface_dpo/item.yaml +++ b/huggingface_dpo/item.yaml @@ -1,23 +1,23 @@ - apiVersion: v1 -categories: [] # List of category names -description: '' # Short description -doc: '' # Path to README.md if exists -example: '' # Path to examole notebook -generationDate: 2024-03-19 16:26:27.342027 -icon: '' # Path to icon file -labels: {} # Key values label pairs -maintainers: [] # List of maintainers -mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema -name: '' # Function name -platformVersion: '' # Function’s Iguazio version requirement, should follow python’s versioning schema +categories: +- machine-learning +- model-training +description: doing the 
alignment with dpo trainer +doc: '' +example: huggingface_dpo_trainer.ipynb +generationDate: 2024-03-19:09-25 +hidden: false +icon: '' +labels: + author: pgw +maintainers: [] +marketplaceType: '' +name: huggingface-dpo-trainer spec: - filename: '' # Implementation file - handler: '' # Handler function name - image: '' # Base image name - kind: '' # Function kind - requirements: [] # List of Pythonic library requirements - customFields: {} # Custom spec fields - env: [] # Spec environment params + filename: huggingface_dpo_trainer.py + handler: dpo_train + image: mlrun/mlrun + kind: job + requirements: [] url: '' -version: 0.0.1 # Function version, should follow standard semantic versioning schema +version: 1.0.0 From e44d87007a404d2f40f6a45be66d0b1977ec2887 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:07:16 +0000 Subject: [PATCH 15/33] add function yaml file --- huggingface_dpo/function.yaml | 374 ++++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 huggingface_dpo/function.yaml diff --git a/huggingface_dpo/function.yaml b/huggingface_dpo/function.yaml new file mode 100644 index 000000000..d0baab33a --- /dev/null +++ b/huggingface_dpo/function.yaml @@ -0,0 +1,374 @@ +kind: job +metadata: + name: huggingface-dpo-trainer + tag: '' + hash: 3db0dab27e7aaa2f91a96c2545060cc7e1a15676 + project: '' + labels: + author: pgw + categories: + - machine-learning + - model-training +spec: + command: '' + args: [] + image: mlrun/mlrun + build: + functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
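    # These keys name the override-able config dicts and double as kwarg prefixes consumed by _update_config.&#xD;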
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft_config"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face / TRL trainer API.&#xD;
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,&#xD;
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
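    # With LoRA / k-bit training only the adapter parameters require gradients, so the printed trainable share is typically small.&#xD;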
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# will be used if user provides "True" with config name as input
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
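# NOTE: the default LoRA target modules above match the projection layer names used by&#xD;
# Llama/Mistral-style architectures (e.g. the Mistral-7B model used in the tests);&#xD;
# other model families may require a different target_modules list.&#xD;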

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.&#xD;
&#xD;
    Goes over all configs and their corresponding prefixes, collects all the keys from the given dict that start&#xD;
     with the prefix, and adds them to the appropriate config.&#xD;

    :param src: dict of all candidate values to update dict.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If given True we use default dict
        # Can also be False or a config dict given by the user, so we check specifically for True&#xD;
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "peft_config":&#xD;
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map to use when training across multiple GPUs&#xD;

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")&#xD;

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the model name and the model class&#xD;
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified we choose the default tokenizer corresponding to the model&#xD;
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a train dataset we load the "train" split&#xD;
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them&#xD;
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and&#xD;
    returns them ready to use in training

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given, we must check whether it contains both datasets or only the train set&#xD;
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Language Model (LLM) with Direct Preference Optimization (DPO) using the provided dataset.&#xD;
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf&#xD;
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used for evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).&#xD;
    :param beta: The DPO beta parameter, controlling how strongly the policy is kept close to the reference model (optional).&#xD;
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )
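    # NOTE: trl's DPOTrainer is typically fed preference data with "prompt", "chosen" and&#xD;
    # "rejected" columns, so the loaded dataset should follow that format.&#xD;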

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )
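    # mlm=False configures the collator for causal (next-token) language modeling batches.&#xD;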

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

    trainer = DPOTrainer(&#xD;
        model=model,
        ref_model=None,&#xD;
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],&#xD;
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    temp_directory = tempfile.mkdtemp()&#xD;
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 + commands: [] + code_origin: '' + origin_filename: '' + requirements: [] + entry_points: + add_interface: + name: add_interface + doc: '' + parameters: + - name: cls + default: '' + - name: obj + type: DPOTrainer + default: '' + - name: restoration + type: MLRunInterfaceRestorationType + default: null + outputs: + - default: '' + lineno: 72 + mlrun_train: + name: mlrun_train + doc: '' + parameters: + - name: cls + default: '' + outputs: + - default: '' + lineno: 82 + wrapper: + name: wrapper + doc: '' + parameters: + - name: self + type: DPOTrainer + default: '' + outputs: + - default: '' + lineno: 83 + on_epoch_begin: + name: on_epoch_begin + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 131 + on_epoch_end: + name: on_epoch_end + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 142 + on_log: + name: on_log + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + - name: logs + type: Dict[str, float] + default: null + outputs: + - default: '' + lineno: 153 + on_train_begin: + name: on_train_begin + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 179 + on_train_end: + name: on_train_end + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + - name: model + type: PreTrainedModel + default: null + - name: tokenizer + type: PreTrainedTokenizer + default: null + outputs: + - default: '' + lineno: 190 + on_evaluate: + name: on_evaluate + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 203 + log_metrics: + name: log_metrics + doc: '' + parameters: + - name: self + default: '' + outputs: + - default: '' + lineno: 217 + log_metric_plot: + name: log_metric_plot + doc: '' + parameters: + - name: self + default: '' + - name: name + type: str + default: '' + - name: scores + type: List[float] + default: '' + outputs: + - default: '' + lineno: 224 + apply_mlrun: + name: apply_mlrun + doc: This is temporary and will be built in mlrun 1.5.0 + parameters: + - name: trainer + type: DPOTrainer + default: '' + - name: model_name + type: str + default: null + - name: tag + type: str + default: '' + - name: context + type: MLClientCtx + default: null + - name: auto_log + type: bool + default: true + - name: labels + type: Dict[str, str] + default: null + - name: extra_data + type: dict + default: null + outputs: + - default: '' + lineno: 246 + dpo_train: + name: dpo_train + doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ + \ dataset.\n The function takes various configuration 
parameters to customize\ + \ the training process\n and adapt the model to specific tasks using a provided\ + \ dataset." + parameters: + - name: context + type: MLClientCtx + doc: mlrun context in order to log trained model + default: '' + - name: train_dataset + type: Union[str, mlrun.datastore.DataItem] + doc: The train dataset used for fine-tuning the language model. + default: '' + - name: eval_dataset + type: str + doc: The eval dataset used for evaluate the language model during training. + default: null + - name: train_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: eval_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: dataset_columns_to_train + type: Union[str, list] + doc: which columns to pass to the model as inputs + default: text + - name: model + type: Union[str, List[str]] + doc: a tuple containing model name and class, or str with model name or path + default: huggingface-model + - name: tokenizer + type: Union[str, List[str]] + doc: a tuple containing tokenizer name and class, or str with tokenizer name + or path + default: null + - name: deepspeed_config + type: Union[dict, bool] + doc: Configuration options for DeepSpeed (optional). + default: false + - name: quantization_config + type: Union[dict, bool] + doc: Configuration options for model quantization (optional). + default: false + - name: peft_config + type: Union[dict, bool] + default: false + - name: beta + type: Union[float, bool] + default: false + - name: training_config + type: dict + doc: Configuration options specific to the fine-tuning training process (optional). + default: {} + - name: model_pretrained_config + type: dict + doc: config to load the pretrained model + default: {} + - name: tokenizer_pretrained_config + type: dict + doc: config to load the pretrained tokenizer + default: {} + - name: data_collator_config + type: dict + doc: Configuration options for data collation during training (optional). + default: {} + - name: task + type: str + doc: A description of the specific task the model is being fine-tuned for. 
+ default: text-generation + - name: use_cuda + type: bool + doc: use gpu or not + default: true + - name: framework + type: str + doc: pt ot tf + default: pt + - name: device_map + type: str + default: auto + outputs: + - default: '' + lineno: 627 + evaluate: + name: evaluate + doc: 'Evaluating the model using perplexity, for more information visit: + + https://huggingface.co/docs/transformers/perplexity' + parameters: + - name: context + doc: mlrun context + default: '' + - name: model_path + doc: path to the model directory + default: '' + - name: data + type: DataFrame + doc: the data to evaluate the model + default: '' + - name: model_name + type: str + doc: name of base model + default: null + - name: tokenizer_name + type: str + doc: name of base tokenizer + default: null + outputs: + - default: '' + lineno: 785 + description: doing the alignment with dpo trainer + default_handler: dpo_train + disable_auto_mount: false + clone_target_dir: '' + env: [] + resources: + requests: + memory: 1Mi + cpu: 25m + limits: + memory: 20Gi + cpu: '2' + priority_class_name: igz-workload-medium + preemption_mode: prevent + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: app.iguazio.com/lifecycle + operator: NotIn + values: + - preemptible + - key: eks.amazonaws.com/capacityType + operator: NotIn + values: + - SPOT + - key: node-lifecycle + operator: NotIn + values: + - spot + tolerations: null + security_context: {} +verbose: false From b343632065b93709652d39314b93e6b2b59f94a5 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 19:20:18 +0000 Subject: [PATCH 16/33] rename the trainer --- huggingface_dpo/huggingface_dpo.py | 870 ++++++++++++++++++ huggingface_dpo/huggingface_dpo_trainer.py | 3 +- huggingface_dpo/test_huggingface_dpo.py | 56 ++ .../test_huggingface_dpo_trainer.py | 6 +- 4 files changed, 932 insertions(+), 3 deletions(-) create mode 100644 huggingface_dpo/huggingface_dpo.py create mode 100644 huggingface_dpo/test_huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..8dcf63b29 --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,870 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
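+# Note (illustrative sketch only, not part of this function's API): `trl.DPOTrainer`
+# fits the model on preference pairs rather than on a single tokenized text column.
+# Each training record is typically expected to carry "prompt", "chosen" and
+# "rejected" fields, e.g.:
+#
+#     {
+#         "prompt": "Summarize DPO in one sentence.",
+#         "chosen": "DPO aligns a model directly on preference pairs, without a separate reward model.",
+#         "rejected": "DPO is a tokenizer setting.",
+#     }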
+
+import importlib
+import os
+import shutil
+import tempfile
+import zipfile
+from abc import ABC
+from typing import Dict, List, Tuple, Union
+
+import mlrun
+import numpy as np
+import pandas as pd
+import peft
+import torch
+import transformers
+import trl
+from datasets import Dataset, load_dataset
+from mlrun.artifacts.manager import Artifact, PlotlyArtifact
+from mlrun.datastore import is_store_uri
+from mlrun.frameworks._common import CommonTypes, MLRunInterface
+from mlrun.utils import logger
+from trl import DPOTrainer
+from peft import (LoraConfig, PeftModel, get_peft_model,
+                  prepare_model_for_kbit_training)
+from plotly import graph_objects as go
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
+                          PreTrainedModel, PreTrainedTokenizer, Trainer,
+                          TrainerCallback, TrainerControl, TrainerState,
+                          TrainingArguments)
+
+supported_tasks = [
+    "question-answering",
+    "summarization",
+    "table-question-answering",
+    "text2text-generation",
+    "text-classification",
+    "sentiment-analysis",
+    "text-generation",
+    "token-classification",
+    "translation",
+    "translation_xx_to_yy",
+]
+
+
+class ConfigKeys:
+    deepspeed = "deepspeed"
+    quantization = "quantization"
+    training = "training"
+    tokenizer_pretrained = "tokenizer_pretrained"
+    model_pretrained = "model_pretrained"
+    peft_config = "peft_config"
+    data_collator = "data_collator"
+    beta = "beta"
+
+
+# ----------------------from MLRUN--------------------------------
+class HFTrainerMLRunInterface(MLRunInterface, ABC):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Interface for adding MLRun features to the `trl.DPOTrainer` API.
+    """
+
+    # MLRun's context default name:
+    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
+
+    # Attributes to replace so the MLRun interface will be fully enabled.
+    _REPLACED_METHODS = [
+        "train",
+        # "evaluate"
+    ]
+
+    @classmethod
+    def add_interface(
+        cls,
+        obj: DPOTrainer,
+        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
+    ):
+        super(HFTrainerMLRunInterface, cls).add_interface(
+            obj=obj, restoration=restoration
+        )
+
+    @classmethod
+    def mlrun_train(cls):
+        def wrapper(self: DPOTrainer, *args, **kwargs):
+            # Restore the evaluation method as `train` will use it:
+            # cls._restore_attribute(obj=self, attribute_name="evaluate")
+
+            # Call the original train method:
+            result = self.original_train(*args, **kwargs)
+
+            # Replace the evaluation method again:
+            # cls._replace_function(obj=self, function_name="evaluate")
+
+            return result
+
+        return wrapper
+
+
+class MLRunCallback(TrainerCallback):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Callback for collecting logs during training / evaluation of the `Trainer` API.
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: trl.DPOTrainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, 
+ labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +PEFT_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=16, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = PEFT_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then 
it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + 
+                            *[examples[col] for col in dataset_columns_to_train],
+                            truncation=True,
+                            padding=True,
+                        ),
+                        batched=True,
+                    ),
+                    None,
+                )
+        else:
+            logger.error("train dataset is mandatory")
+            raise KeyError("no train dataset found in given dataset")
+
+    # Tokenize the data so the model can understand it
+    tokenized_train_dataset = train_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    tokenized_eval_dataset = eval_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    return tokenized_train_dataset, tokenized_eval_dataset
+
+
+def dpo_train(
+    context: mlrun.MLClientCtx,
+    train_dataset: Union[str, mlrun.datastore.DataItem],
+    eval_dataset: str = None,
+    train_load_dataset_kwargs: dict = {},
+    eval_load_dataset_kwargs: dict = {},
+    dataset_columns_to_train: Union[str, list] = "text",
+    model: Union[str, List[str]] = "huggingface-model",
+    tokenizer: Union[str, List[str]] = None,
+    deepspeed_config: Union[dict, bool] = False,
+    quantization_config: Union[dict, bool] = False,
+    peft_config: Union[dict, bool] = False,
+    beta: Union[float, bool] = False,
+    training_config: dict = {},
+    model_pretrained_config: dict = {},
+    tokenizer_pretrained_config: dict = {},
+    data_collator_config: dict = {},
+    task: str = "text-generation",
+    use_cuda: bool = True,
+    framework: str = "pt",
+    device_map: str = "auto",
+    **kwargs,
+):
+    """
+    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
+    The function takes various configuration parameters to customize the training process
+    and adapt the model to specific tasks using a provided dataset.
+
+    :param context: mlrun context in order to log the trained model
+    :param dataset_columns_to_train: which columns to pass to the model as inputs
+    :param eval_load_dataset_kwargs: kwargs for dataset loading
+    :param train_load_dataset_kwargs: kwargs for dataset loading
+    :param framework: pt or tf
+    :param use_cuda: use gpu or not
+    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
+    :param model_pretrained_config: config to load the pretrained model
+    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
+    :param model: a tuple containing model name and class, or str with model name or path
+    :param train_dataset: The train dataset used for fine-tuning the language model.
+    :param eval_dataset: The eval dataset used to evaluate the language model during training.
+    :param deepspeed_config: Configuration options for DeepSpeed (optional).
+    :param quantization_config: Configuration options for model quantization (optional).
+    :param peft_config: Configuration options for PEFT / LoRA adapters (optional).
+    :param training_config: Configuration options specific to the fine-tuning training process (optional).
+    :param data_collator_config: Configuration options for data collation during training (optional).
+    :param task: A description of the specific task the model is being fine-tuned for.
+    :param kwargs: Additional keyword arguments.
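+    :param beta: The beta factor for the DPO loss as used by `trl.DPOTrainer`; higher values
+                 keep the trained policy closer to the reference model (optional).
+    :param device_map: A device map for model loading when training with multiple GPUs (optional).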
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + ConfigKeys.peft_config: peft_config, + ConfigKeys.beta: beta, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = trl.DPOTrainer( + model=model, + ref_model = None, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + peft_config=configs[ConfigKeys.peft_config], + beta = configs[ConfigKeys.beta], + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
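+        # Perplexity is recovered below as the exponential of the mean negative
+        # log-likelihood over all evaluated windows: ppl = exp(mean(nll_1..nll_k)).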
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 5f2a680d0..0eb076dde 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -675,7 +675,8 @@ def dpo_train( # TODO: match forward.keyword to dataset.keyword - check if relevant in new design # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - + import pdb + pdb.set_trace() # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py new file mode 100644 index 000000000..b310aaf37 --- /dev/null +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -0,0 +1,56 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile + +import mlrun + + +def test_dpo_train(): + + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + auto_trainer = mlrun.import_function("function.yaml") + + training_arguments = { + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 1, + "warmup_steps": 2, + "max_steps": 10, + "learning_rate": 2e-4, + "logging_steps": 1, + } + + params = { + "model": (model_name, "transformers.AutoModelForCausalLM"), + "tokenizer": tokenizer, + "train_dataset": "HuggingFaceH4/orca_dpo_pairs", + "training_config": training_arguments, + "dataset_columns_to_train": "quote", + "model_pretrained_config": {"use_cache": False}, + "use_cuda": False, + } + + try: + with tempfile.TemporaryDirectory() as test_directory: + auto_trainer.run( + local=True, + params=params, + handler="dpo_train", + returns=["model"], + workdir=test_directory, + ) + + except Exception as exception: + print(f"- The training failed - raised the following error:\n- {exception}") diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 7899debba..d2cfaaf02 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -7,7 +7,7 @@ def test_dpo_train(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") + dop_trainer = mlrun.import_function("function.yaml") training_arguments = { "per_device_train_batch_size": 4, @@ -20,17 +20,19 @@ def test_dpo_train(): params = { "model": (model_name, "transformers.AutoModelForCausalLM"), + "ref_model": None, "tokenizer": tokenizer, "train_dataset": "Abirate/english_quotes", "training_config": training_arguments, "dataset_columns_to_train": "quote", "model_pretrained_config": {"use_cache": False}, + "use_cuda": False, } try: with tempfile.TemporaryDirectory() as 
test_directory: - auto_trainer.run( + dpo_trainer.run( local=True, params=params, handler="dpo_train", From 6b28938560829ca49cb11f8570550e26333a50c7 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 19:21:00 +0000 Subject: [PATCH 17/33] get rid of the older version --- huggingface_dpo/huggingface_dpo.py | 870 ------------------------ huggingface_dpo/test_huggingface_dpo.py | 56 -- 2 files changed, 926 deletions(-) delete mode 100644 huggingface_dpo/huggingface_dpo.py delete mode 100644 huggingface_dpo/test_huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py deleted file mode 100644 index 8dcf63b29..000000000 --- a/huggingface_dpo/huggingface_dpo.py +++ /dev/null @@ -1,870 +0,0 @@ -# Copyright 2024 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from trl import DPOTrainer -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - peft_config = "peft_config" - data_collator = "data_collator" - beta = "beta" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. 
- _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: DPOTrainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: DPOTrainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - 
self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: trl.DPOTrainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -PEFT_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=16, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = PEFT_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - 
logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - 
train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def dpo_train( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - peft_config: Union[dict, bool] = False, - beta: Union[float, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. 
- - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - ConfigKeys.peft_config: peft_config, - ConfigKeys.beta: beta, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the 
trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = trl.DPOTrainer( - model=model, - ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
- neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py deleted file mode 100644 index b310aaf37..000000000 --- a/huggingface_dpo/test_huggingface_dpo.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2024 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile - -import mlrun - - -def test_dpo_train(): - - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "tokenizer": tokenizer, - "train_dataset": "HuggingFaceH4/orca_dpo_pairs", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - auto_trainer.run( - local=True, - params=params, - handler="dpo_train", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") From 5239e5554a5553047c2f3f3b0fe67df5b75fcb7a Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 01:40:22 +0000 Subject: [PATCH 18/33] can trigger the run. seems don't need override the dataloader --- huggingface_dpo/huggingface_dpo_trainer.py | 53 ++++++++++++------- huggingface_dpo/requirements.txt | 1 + .../test_huggingface_dpo_trainer.py | 52 ++++++++++++++++++ 3 files changed, 87 insertions(+), 19 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 0eb076dde..64389c23c 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -1,3 +1,17 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import importlib import os import shutil @@ -244,7 +258,7 @@ def log_metric_plot(self, name: str, scores: List[float]): def apply_mlrun( - trainer: trl.DPOTrainer, + trainer: DPOTrainer, model_name: str = None, tag: str = "", context: mlrun.MLClientCtx = None, @@ -675,8 +689,6 @@ def dpo_train( # TODO: match forward.keyword to dataset.keyword - check if relevant in new design # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - import pdb - pdb.set_trace() # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, @@ -710,21 +722,24 @@ def dpo_train( model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - + whole_dataset = load_dataset(train_dataset, split='train') + whole_dataset = whole_dataset.shuffle(seed=42).train_test_split(seed=42, test_size=.3) + train_dataset = whole_dataset['train'] + eval_dataset = whole_dataset['test'] # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) + #tokenized_train, tokenized_eval = _prepare_dataset( + # train_dataset=train_dataset, + # eval_dataset=eval_dataset, + # train_load_dataset_kwargs=train_load_dataset_kwargs, + # eval_load_dataset_kwargs=eval_load_dataset_kwargs, + # tokenizer=tokenizer, + # dataset_columns_to_train=dataset_columns_to_train, + #) # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) + #data_collator = transformers.DataCollatorForLanguageModeling( + # tokenizer=tokenizer, mlm=False, **data_collator_config + #) # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] @@ -742,15 +757,15 @@ def dpo_train( **train_kwargs, ) - trainer = trl.DPOTrainer( + trainer = DPOTrainer( model=model, ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, + train_dataset=train_dataset, + eval_dataset=eval_dataset, peft_config=configs[ConfigKeys.peft_config], beta = configs[ConfigKeys.beta], tokenizer=tokenizer, - data_collator=data_collator, + #data_collator=data_collator, args=training_args, ) diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt index 215b90562..c03846397 100644 --- a/huggingface_dpo/requirements.txt +++ b/huggingface_dpo/requirements.txt @@ -5,3 +5,4 @@ datasets plotly trl mlrun +bitsandbytes diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index d2cfaaf02..fcd373759 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -1,7 +1,59 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import tempfile +from huggingface_dpo_trainer import dpo_train import mlrun +def test_dpo_fn(): + ctx = mlrun.get_or_create_ctx(name='test_dpo') + train_dataset = "unalignment/toxic-dpo-v0.2" + training_arguments = { + "evaluation_strategy": "steps", + "do_eval": True, + "optim": "paged_adamw_8bit", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 1, + "log_level": "info", + "save_steps": 100, + "learning_rate": 5e-7, + "eval_steps": 100, + "num_train_epochs": 1, + "max_steps": 100, + "warmup_steps": 20, + "fp16": True, + "lr_scheduler_type": "cosine", + "remove_unused_columns": True, + "gradient_checkpointing": True, + } + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + dpo_train( + context = ctx, + train_dataset = train_dataset, + model = (model_name,"transformers.AutoModelForCausalLM"), + tokenizer = tokenizer, + dataset_columns_to_train = ['chosen', 'rejected'], + training_config = training_arguments, + use_cuda = True, + beta = 0.1, + split='train', + ) + + def test_dpo_train(): From 1f059b891203de9da7f365d546a622310c8e6722 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 13:07:15 -0700 Subject: [PATCH 19/33] adding the maxlength --- huggingface_dpo/test_huggingface_dpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index fcd373759..6a434f7dc 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -51,6 +51,8 @@ def test_dpo_fn(): use_cuda = True, beta = 0.1, split='train', + max_length=1024, + max_prompt_length=2048, ) From e5d079249016d264090d9840736dc261203b05ae Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 15:42:16 -0700 Subject: [PATCH 20/33] get rid of the trainer interface --- huggingface_dpo/huggingface_dpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 64389c23c..349d98e1b 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -37,7 +37,7 @@ from plotly import graph_objects as go from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, + PreTrainedModel, PreTrainedTokenizer, TrainerCallback, TrainerControl, TrainerState, TrainingArguments) From eb300fd531888062b6f387d4c760a15735b3fa45 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 22:43:04 +0000 Subject: [PATCH 21/33] override --- huggingface_dpo/huggingface_dpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 64389c23c..844d6b69d 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -767,6 +767,8 @@ def dpo_train( tokenizer=tokenizer, #data_collator=data_collator, args=training_args, + max_length=1024, + max_prompt_length=2048, ) apply_mlrun(trainer, model_name=model_name.split("/")[-1]) From 14be77620ce1bfef106bc5dbf3aedaccd3372945 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 26 Mar 2024 21:51:09 +0000 Subject: [PATCH 22/33] training job can run but the artifact can't store 
--- huggingface_dpo/huggingface_dpo_trainer.py | 30 ++++--------------- .../test_huggingface_dpo_trainer.py | 16 +++++----- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index eddd74e8c..e50cb64af 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -41,19 +41,6 @@ TrainerCallback, TrainerControl, TrainerState, TrainingArguments) -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - class ConfigKeys: deepspeed = "deepspeed" @@ -61,7 +48,7 @@ class ConfigKeys: training = "training" tokenizer_pretrained = "tokenizer_pretrained" model_pretrained = "model_pretrained" - peft_config = "peft_config" + peft_config = "peft" data_collator = "data_collator" beta = "beta" @@ -317,7 +304,7 @@ def _print_trainable_parameters(model): ) PEFT_CONFIG = peft.LoraConfig( - r=8, + r=16, lora_alpha=16, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], @@ -360,7 +347,7 @@ def _update_config(src: dict, dst: dict): if config is True and config_name == "quantization": config = QUANTIZATION_CONFIG - if config is True and config_name == "lora": + if config is True and config_name == "peft": config = PEFT_CONFIG if config is True and config_name == "deepspeed": @@ -423,11 +410,6 @@ def _set_model_and_tokenizer( :returns: model and tokenizer """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - # load model from store if isinstance(model, str) and is_store_uri(model): pass @@ -702,6 +684,7 @@ def dpo_train( } _update_config(dst=configs, src=kwargs) + # check gpu permission and availability if use_cuda: if torch.cuda.is_available(): @@ -765,10 +748,9 @@ def dpo_train( peft_config=configs[ConfigKeys.peft_config], beta = configs[ConfigKeys.beta], tokenizer=tokenizer, - #data_collator=data_collator, args=training_args, - max_length=1024, - max_prompt_length=2048, + max_length=2048, + max_prompt_length=4096, ) apply_mlrun(trainer, model_name=model_name.split("/")[-1]) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 6a434f7dc..1f3a9a772 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -20,6 +20,7 @@ def test_dpo_fn(): ctx = mlrun.get_or_create_ctx(name='test_dpo') train_dataset = "unalignment/toxic-dpo-v0.2" + training_arguments = { "evaluation_strategy": "steps", "do_eval": True, @@ -28,12 +29,12 @@ def test_dpo_fn(): "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 100, + "save_steps": 2, "learning_rate": 5e-7, - "eval_steps": 100, + "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 100, - "warmup_steps": 20, + "max_steps": 10, + "warmup_steps": 5, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, @@ -44,15 +45,12 @@ def test_dpo_fn(): dpo_train( context = ctx, train_dataset = train_dataset, - model = (model_name,"transformers.AutoModelForCausalLM"), + peft_config=True, + model = model_name, tokenizer = tokenizer, - dataset_columns_to_train = ['chosen', 
'rejected'], training_config = training_arguments, use_cuda = True, beta = 0.1, - split='train', - max_length=1024, - max_prompt_length=2048, ) From 8ed0555aea873e0bc4e840f64cbd6e58bcae9b2f Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 27 Mar 2024 00:02:26 +0000 Subject: [PATCH 23/33] why the artifact can be stored? --- huggingface_dpo/huggingface_dpo_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index e50cb64af..9f5c00e19 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -241,7 +241,9 @@ def log_metric_plot(self, name: str, scores: List[float]): # Create the plotly artifact: artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) + import pdb + pdb.set_trace() + #self._artifacts[artifact_name] = self._context.log_artifact(artifact) def apply_mlrun( From 465c2087cb8a630c703f4a66ec7bfa6770965b32 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 27 Mar 2024 21:36:52 +0000 Subject: [PATCH 24/33] solved the naming issue, now can store the artifact --- huggingface_dpo/huggingface_dpo_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 9f5c00e19..fa534f631 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -239,11 +239,11 @@ def log_metric_plot(self, name: str, scores: List[float]): ) # Create the plotly artifact: + if '/' in name: + name = '_'.join(name.split('/')) artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - import pdb - pdb.set_trace() - #self._artifacts[artifact_name] = self._context.log_artifact(artifact) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) def apply_mlrun( From 308d94f4bb08ab9a2b3170891bdfed70b9b507b6 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 04:59:04 +0000 Subject: [PATCH 25/33] testing --- huggingface_dpo/huggingface_dpo_trainer.py | 115 +++--------------- .../test_huggingface_dpo_trainer.py | 17 +-- 2 files changed, 29 insertions(+), 103 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index fa534f631..823f83148 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -384,8 +384,6 @@ def _get_class_object(class_path: str) -> type: module_path, class_name = class_path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) - - def _set_model_and_tokenizer( model: Union[str, List[str]], tokenizer: Union[str, List[str]], @@ -490,7 +488,6 @@ def _set_model_and_tokenizer( return model_name, model, tokenizer - def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: """ loads the specific dataset provided by the user @@ -517,6 +514,7 @@ def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: return dataset.get("eval") elif "validation" in dataset: return dataset.get("validation") + return dataset def _prepare_dataset( @@ -524,8 +522,6 @@ def _prepare_dataset( eval_dataset: str, train_load_dataset_kwargs, eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], ) -> (Dataset, 
Union[Dataset, None]): """ Loads the train and eval datasets (if provided) passes them through the tokenizer and @@ -533,34 +529,11 @@ def _prepare_dataset( :param train_dataset: the name or path to the train dataset :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) :param train_load_dataset_kwargs: kwargs for dataset loading :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through :returns: tokenized datasets """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) # Load datasets # if provided two paths/names we load each separately using designated func @@ -571,7 +544,6 @@ def _prepare_dataset( eval_dataset = _dataset_loader( dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs ) - # if only on path is given then we must check if it contains both dataset or if only one should be used else: dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) @@ -584,42 +556,13 @@ def _prepare_dataset( elif "validation" in dataset: eval_dataset = dataset.get("validation") else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) + return train_dataset else: logger.error("train dataset is mandatory") raise KeyError("no train dataset found in given dataset") - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset + return train_dataset, eval_dataset def dpo_train( @@ -628,7 +571,6 @@ def dpo_train( eval_dataset: str = None, train_load_dataset_kwargs: dict = {}, eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", model: Union[str, List[str]] = "huggingface-model", tokenizer: Union[str, List[str]] = None, deepspeed_config: Union[dict, bool] = False, @@ -637,8 +579,8 @@ def dpo_train( beta: Union[float, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config : dict={}, task: str = "text-generation", use_cuda: bool = True, framework: str = "pt", @@ -646,33 +588,31 @@ def dpo_train( **kwargs, ): """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. 
+ Form a dpo training job to do llm alignment The function takes various configuration parameters to customize the training process and adapt the model to specific tasks using a provided dataset. :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path :param train_dataset: The train dataset used for fine-tuning the language model. :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path :param deepspeed_config: Configuration options for DeepSpeed (optional). :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param peft_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param beta: super parameter of KL divergence :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param model_pretrained_config: config to load the pretrained model + :param tokenizer_pretrained_config: config to load the pretrained tokenizer :param data_collator_config: Configuration options for data collation during training (optional). :param task: A description of the specific task the model is being fine-tuned for. + :param use_cuda: use gpu or not + :param framework: pt ot tf :param kwargs: Additional keyword arguments. 
""" - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, @@ -699,33 +639,16 @@ def dpo_train( model_name, model, tokenizer = _set_model_and_tokenizer( model=model, tokenizer=tokenizer, - task=task, framework=framework, + task = task, quantization_config=configs[ConfigKeys.quantization], use_cuda=use_cuda, tokenizer_pretrained_config=tokenizer_pretrained_config, model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - whole_dataset = load_dataset(train_dataset, split='train') - whole_dataset = whole_dataset.shuffle(seed=42).train_test_split(seed=42, test_size=.3) - train_dataset = whole_dataset['train'] - eval_dataset = whole_dataset['test'] - # Load datasets - #tokenized_train, tokenized_eval = _prepare_dataset( - # train_dataset=train_dataset, - # eval_dataset=eval_dataset, - # train_load_dataset_kwargs=train_load_dataset_kwargs, - # eval_load_dataset_kwargs=eval_load_dataset_kwargs, - # tokenizer=tokenizer, - # dataset_columns_to_train=dataset_columns_to_train, - #) - - # Initialize the data collator for the trainer to use in order to create batches of data - #data_collator = transformers.DataCollatorForLanguageModeling( - # tokenizer=tokenizer, mlm=False, **data_collator_config - #) - + train_dataset, eval_dataset = _prepare_dataset(train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs) + # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 1f3a9a772..64ec36886 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -18,40 +18,43 @@ import mlrun def test_dpo_fn(): + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + #dop_trainer = mlrun.import_function("function.yaml") + ctx = mlrun.get_or_create_ctx(name='test_dpo') train_dataset = "unalignment/toxic-dpo-v0.2" - + eval_dataset = "unalignment/toxic-dpo-v0.2" training_arguments = { "evaluation_strategy": "steps", - "do_eval": True, + "do_eval": False, "optim": "paged_adamw_8bit", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 2, + "save_steps": 5, "learning_rate": 5e-7, "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 10, + "max_steps": 5, "warmup_steps": 5, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, "gradient_checkpointing": True, } - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name dpo_train( context = ctx, train_dataset = train_dataset, + eval_dataset = eval_dataset, peft_config=True, model = model_name, tokenizer = tokenizer, training_config = training_arguments, use_cuda = True, beta = 0.1, - ) + ) From b660dd7134437cfcf0bccc60263cabc80d37eea4 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:00:57 +0000 Subject: [PATCH 26/33] fmt --- huggingface_dpo/huggingface_dpo_trainer.py | 56 +++++++++++------ .../test_huggingface_dpo_trainer.py | 61 +++++++++---------- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 
823f83148..1f5154a7b 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -32,14 +32,20 @@ from mlrun.frameworks._common import CommonTypes, MLRunInterface from mlrun.utils import logger from trl import DPOTrainer -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) +from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + PreTrainedModel, + PreTrainedTokenizer, + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) class ConfigKeys: @@ -239,8 +245,8 @@ def log_metric_plot(self, name: str, scores: List[float]): ) # Create the plotly artifact: - if '/' in name: - name = '_'.join(name.split('/')) + if "/" in name: + name = "_".join(name.split("/")) artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) self._artifacts[artifact_name] = self._context.log_artifact(artifact) @@ -308,8 +314,15 @@ def _print_trainable_parameters(model): PEFT_CONFIG = peft.LoraConfig( r=16, lora_alpha=16, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"], + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", @@ -384,6 +397,8 @@ def _get_class_object(class_path: str) -> type: module_path, class_name = class_path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) + + def _set_model_and_tokenizer( model: Union[str, List[str]], tokenizer: Union[str, List[str]], @@ -488,6 +503,7 @@ def _set_model_and_tokenizer( return model_name, model, tokenizer + def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: """ loads the specific dataset provided by the user @@ -561,7 +577,6 @@ def _prepare_dataset( logger.error("train dataset is mandatory") raise KeyError("no train dataset found in given dataset") - return train_dataset, eval_dataset @@ -579,8 +594,8 @@ def dpo_train( beta: Union[float, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config : dict={}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, task: str = "text-generation", use_cuda: bool = True, framework: str = "pt", @@ -626,7 +641,6 @@ def dpo_train( } _update_config(dst=configs, src=kwargs) - # check gpu permission and availability if use_cuda: if torch.cuda.is_available(): @@ -640,15 +654,17 @@ def dpo_train( model=model, tokenizer=tokenizer, framework=framework, - task = task, + task=task, quantization_config=configs[ConfigKeys.quantization], use_cuda=use_cuda, tokenizer_pretrained_config=tokenizer_pretrained_config, model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - train_dataset, eval_dataset = _prepare_dataset(train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs) - + train_dataset, eval_dataset = _prepare_dataset( + train_dataset, eval_dataset, train_load_dataset_kwargs, 
eval_load_dataset_kwargs + ) + # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] @@ -667,11 +683,11 @@ def dpo_train( trainer = DPOTrainer( model=model, - ref_model = None, + ref_model=None, train_dataset=train_dataset, eval_dataset=eval_dataset, peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], + beta=configs[ConfigKeys.beta], tokenizer=tokenizer, args=training_args, max_length=2048, diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 64ec36886..f073aafb5 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -17,46 +17,46 @@ import mlrun + def test_dpo_fn(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - #dop_trainer = mlrun.import_function("function.yaml") + # dop_trainer = mlrun.import_function("function.yaml") - ctx = mlrun.get_or_create_ctx(name='test_dpo') + ctx = mlrun.get_or_create_ctx(name="test_dpo") train_dataset = "unalignment/toxic-dpo-v0.2" eval_dataset = "unalignment/toxic-dpo-v0.2" training_arguments = { - "evaluation_strategy": "steps", - "do_eval": False, - "optim": "paged_adamw_8bit", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 1, - "log_level": "info", - "save_steps": 5, - "learning_rate": 5e-7, - "eval_steps": 1, - "num_train_epochs": 1, - "max_steps": 5, - "warmup_steps": 5, - "fp16": True, - "lr_scheduler_type": "cosine", - "remove_unused_columns": True, - "gradient_checkpointing": True, - } + "evaluation_strategy": "steps", + "do_eval": False, + "optim": "paged_adamw_8bit", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 1, + "log_level": "info", + "save_steps": 5, + "learning_rate": 5e-7, + "eval_steps": 1, + "num_train_epochs": 1, + "max_steps": 5, + "warmup_steps": 5, + "fp16": True, + "lr_scheduler_type": "cosine", + "remove_unused_columns": True, + "gradient_checkpointing": True, + } dpo_train( - context = ctx, - train_dataset = train_dataset, - eval_dataset = eval_dataset, - peft_config=True, - model = model_name, - tokenizer = tokenizer, - training_config = training_arguments, - use_cuda = True, - beta = 0.1, + context=ctx, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + peft_config=True, + model=model_name, + tokenizer=tokenizer, + training_config=training_arguments, + use_cuda=True, + beta=0.1, ) - def test_dpo_train(): @@ -81,7 +81,6 @@ def test_dpo_train(): "training_config": training_arguments, "dataset_columns_to_train": "quote", "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, } From 3fe14517bead761f82e68ca7e8e07940a422c48e Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:13:27 +0000 Subject: [PATCH 27/33] update the function yaml file --- huggingface_dpo/function.yaml | 45 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/huggingface_dpo/function.yaml b/huggingface_dpo/function.yaml index d0baab33a..c3593fa63 100644 --- a/huggingface_dpo/function.yaml +++ b/huggingface_dpo/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: huggingface-dpo-trainer tag: '' - hash: 3db0dab27e7aaa2f91a96c2545060cc7e1a15676 + hash: 584b20584f58bfa89225b6999e6b55ad017dd87a project: '' labels: author: pgw @@ -14,7 +14,7 @@ spec: args: [] image: mlrun/mlrun build: - functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft_config"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face Trainer API.&#x000D;
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,&#x000D;
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# will be used if user provides "True" with config name as input
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}
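&#x000D;
# Note (illustrative): passing ``True`` for one of these configs (e.g. ``quantization_config=True``&#x000D;
# in ``dpo_train``) selects the corresponding default above inside ``_update_config``; a dict or&#x000D;
# config object passed by the user is kept and only updated from any matching ``<prefix>_*`` kwargs.&#x000D;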


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can, for example, add or modify values in the default configs.&#x000D;

    Goes over all configs and their corresponding prefixes, collects all the keys from the given dict that start&#x000D;
     with the prefix, and adds them to the appropriate config.&#x000D;

    :param src: dict of all candidate values with which to update the configs.&#x000D;
    :param dst: dict containing all configs to update.
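&#x000D;
    Example (illustrative): a kwarg such as ``training_learning_rate=5e-7`` matches the&#x000D;
     ``training`` prefix and is therefore stored in the training config as ``learning_rate=5e-7``.&#x000D;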
    """

    for config_name, config in dst.items():

        # If given True we use the default config dict&#x000D;
        # Can also be False or a config dict given by the user, so we check specifically for True&#x000D;
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "lora":
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training if using a number of GPUs&#x000D;

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")&#x000D;

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a tuple then we assume it contains both the name and the class&#x000D;
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified we choose the default tokenizer that corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a training dataset we load the "train" split&#x000D;
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them&#x000D;
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and&#x000D;
    returns them ready to use in training.&#x000D;

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given then we must check whether it contains both datasets or only one should be used&#x000D;
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Large Language Model (LLM) on a specific task using the provided dataset.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).
    :param beta: The beta hyperparameter scaling the implicit KL penalty in the DPO loss (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

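    # DPO optimizes the policy directly on preference pairs (chosen vs. rejected responses);
    # `beta` scales the implicit KL penalty that keeps the trained policy close to the
    # reference policy (derived internally from the base model, since `ref_model` is None).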
    trainer = trl.DPOTrainer(
        model=model,
        ref_model=None,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    # Save the trained model to a temporary directory (mkdtemp, so it is not removed under us):
    temp_directory = tempfile.mkdtemp()
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

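    # Perplexity is the exponential of the mean negative log-likelihood over the evaluated windows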
    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from plotly import graph_objects as go
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    PreTrainedModel,
    PreTrainedTokenizer,
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the HuggingFace `DPOTrainer` API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        if "/" in name:
            name = "_".join(name.split("/"))
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# used when the user passes "True" for the corresponding config name
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.

    Goes over all configs and their corresponding prefixes, collects every key in the given dict that starts
    with the prefix and adds it to the appropriate config.

    :param src: dict of all candidate values with which to update the configs.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If True is given, we use the default config
        # It can also be False or a config dict provided by the user, so we check specifically for True
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "peft":
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})

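# Illustrative example (an assumption about usage, not part of the original code):
# keyword arguments prefixed with a config name are routed into that config, e.g. passing
# `training_num_train_epochs=3` through `dpo_train(**kwargs)` adds {"num_train_epochs": 3}
# to the training config, while `quantization_config=True` swaps in the default
# QUANTIZATION_CONFIG defined above.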

def _get_class_object(class_path: str) -> type:
    """
    Given a full class name, this function returns the correct class.

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :returns: the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training when using multiple GPUs

    :returns: model and tokenizer
    """
    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the name and the class
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # if we don't get the model class, we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    device_map = device_map if use_cuda else None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified, choose the default tokenizer corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer

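# Illustrative usage sketch (an assumption, not part of the original code): `model` may be
# a plain name such as "mistralai/Mistral-7B-Instruct-v0.2", or a [name, class-path] pair,
# e.g. ["mistralai/Mistral-7B-Instruct-v0.2", "transformers.AutoModelForCausalLM"], in
# which case `_get_class_object` resolves the class used to load the pretrained weights.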

def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a train dataset, load it with the "train" split
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")
    return dataset

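# Illustrative behavior (hypothetical dataset name): `_dataset_loader("org/my-dpo-dataset",
# is_train=True)` loads the "train" split, while `is_train=False` falls back to a "test",
# "eval" or "validation" split when one exists.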

def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
) -> Tuple[Dataset, Union[Dataset, None]]:
    """
    Loads the train and eval datasets (if provided) and returns them ready for use in training.

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading

    :returns: loaded datasets
    """

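    # Note: the datasets are not tokenized here; TRL's DPOTrainer expects preference-style
    # records (typically "prompt", "chosen" and "rejected" columns) and tokenizes them internally.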
    # Load datasets
    # if two paths/names are provided, load each separately using the designated loader
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )
    # if only one path is given, check whether it contains both datasets or only the train set
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only a train dataset was given, return it without an eval dataset
                return train_dataset, None
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    return train_dataset, eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Runs a DPO (Direct Preference Optimization) training job to align an LLM.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).
    :param beta: The beta hyperparameter scaling the implicit KL penalty in the DPO loss (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param use_cuda: use gpu or not
    :param framework: pt or tf
    :param kwargs: Additional keyword arguments.
    """

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        framework=framework,
        task=task,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )
    train_dataset, eval_dataset = _prepare_dataset(
        train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

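    # `ref_model=None` lets TRL derive the frozen reference policy from the base model
    # (the PEFT adapters are disabled for the reference pass); `beta` scales the implicit
    # KL penalty of the DPO loss, and max_length / max_prompt_length bound the tokenized
    # prompt and response lengths.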
    trainer = DPOTrainer(
        model=model,
        ref_model=None,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],
        tokenizer=tokenizer,
        args=training_args,
        max_length=2048,
        max_prompt_length=4096,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    # Save the trained model to a temporary directory (mkdtemp, so it is not removed under us):
    temp_directory = tempfile.mkdtemp()
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

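    # Perplexity is the exponential of the mean negative log-likelihood over the evaluated windows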
    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 commands: [] code_origin: '' origin_filename: '' @@ -34,7 +34,7 @@ spec: default: null outputs: - default: '' - lineno: 72 + lineno: 79 mlrun_train: name: mlrun_train doc: '' @@ -43,7 +43,7 @@ spec: default: '' outputs: - default: '' - lineno: 82 + lineno: 89 wrapper: name: wrapper doc: '' @@ -53,7 +53,7 @@ spec: default: '' outputs: - default: '' - lineno: 83 + lineno: 90 on_epoch_begin: name: on_epoch_begin doc: '' @@ -71,7 +71,7 @@ spec: default: '' outputs: - default: '' - lineno: 131 + lineno: 138 on_epoch_end: name: on_epoch_end doc: '' @@ -89,7 +89,7 @@ spec: default: '' outputs: - default: '' - lineno: 142 + lineno: 149 on_log: name: on_log doc: '' @@ -110,7 +110,7 @@ spec: default: null outputs: - default: '' - lineno: 153 + lineno: 160 on_train_begin: name: on_train_begin doc: '' @@ -128,7 +128,7 @@ spec: default: '' outputs: - default: '' - lineno: 179 + lineno: 186 on_train_end: name: on_train_end doc: '' @@ -152,7 +152,7 @@ spec: default: null outputs: - default: '' - lineno: 190 + lineno: 197 on_evaluate: name: on_evaluate doc: '' @@ -170,7 +170,7 @@ spec: default: '' outputs: - default: '' - lineno: 203 + lineno: 210 log_metrics: name: log_metrics doc: '' @@ -179,7 +179,7 @@ spec: default: '' outputs: - default: '' - lineno: 217 + lineno: 224 log_metric_plot: name: log_metric_plot doc: '' @@ -194,7 +194,7 @@ spec: default: '' outputs: - default: '' - lineno: 224 + lineno: 231 apply_mlrun: name: apply_mlrun doc: This is temporary and will be built in mlrun 1.5.0 @@ -222,13 +222,12 @@ spec: default: null outputs: - default: '' - lineno: 246 + lineno: 255 dpo_train: name: dpo_train - doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ - \ dataset.\n The function takes various configuration parameters to customize\ - \ the training process\n and adapt the model to specific tasks using a provided\ - \ dataset." + doc: "Form a dpo training job to do llm alignment\n The function takes various\ + \ configuration parameters to customize the training process\n and adapt the\ + \ model to specific tasks using a provided dataset." parameters: - name: context type: MLClientCtx @@ -250,10 +249,6 @@ spec: type: dict doc: kwargs for dataset loading default: {} - - name: dataset_columns_to_train - type: Union[str, list] - doc: which columns to pass to the model as inputs - default: text - name: model type: Union[str, List[str]] doc: a tuple containing model name and class, or str with model name or path @@ -273,9 +268,11 @@ spec: default: false - name: peft_config type: Union[dict, bool] + doc: Configuration options for Low-Rank Approximation (LoRA) (optional). 
default: false - name: beta type: Union[float, bool] + doc: super parameter of KL divergence default: false - name: training_config type: dict @@ -310,7 +307,7 @@ spec: default: auto outputs: - default: '' - lineno: 627 + lineno: 583 evaluate: name: evaluate doc: 'Evaluating the model using perplexity, for more information visit: @@ -337,7 +334,7 @@ spec: default: null outputs: - default: '' - lineno: 785 + lineno: 726 description: doing the alignment with dpo trainer default_handler: dpo_train disable_auto_mount: false From bbc2fa2001f7b7eb39a845526fc882eb985c6e88 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:13:45 +0000 Subject: [PATCH 28/33] update the test case --- huggingface_dpo/test_huggingface_dpo_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index f073aafb5..63f3f50c8 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -31,15 +31,15 @@ def test_dpo_fn(): "do_eval": False, "optim": "paged_adamw_8bit", "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 5, + "save_steps": 1, "learning_rate": 5e-7, "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 5, - "warmup_steps": 5, + "max_steps": 1, + "warmup_steps": 1, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, From c781ecf4f1aec77f2d9a692330975ba3171b0f8e Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:17:14 +0000 Subject: [PATCH 29/33] passed the test case --- .../test_huggingface_dpo_trainer.py | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 63f3f50c8..1aa31707e 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -21,7 +21,6 @@ def test_dpo_fn(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - # dop_trainer = mlrun.import_function("function.yaml") ctx = mlrun.get_or_create_ctx(name="test_dpo") train_dataset = "unalignment/toxic-dpo-v0.2" @@ -56,43 +55,3 @@ def test_dpo_fn(): use_cuda=True, beta=0.1, ) - - -def test_dpo_train(): - - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name - dop_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "ref_model": None, - "tokenizer": tokenizer, - "train_dataset": "Abirate/english_quotes", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - dpo_trainer.run( - local=True, - params=params, - handler="dpo_train", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") From 2f5361e72827d84c70dd240a5857b6b9e8459785 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:26:38 +0000 Subject: [PATCH 30/33] adding the function yaml to 
the test case --- .../test_huggingface_dpo_trainer.py | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 1aa31707e..98783c644 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -13,12 +13,13 @@ # limitations under the License. import tempfile -from huggingface_dpo_trainer import dpo_train +# from huggingface_dpo_trainer import dpo_train import mlrun def test_dpo_fn(): + dpo_trainer = mlrun.import_function("function.yaml") model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name @@ -44,14 +45,24 @@ def test_dpo_fn(): "remove_unused_columns": True, "gradient_checkpointing": True, } - dpo_train( - context=ctx, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - peft_config=True, - model=model_name, - tokenizer=tokenizer, - training_config=training_arguments, - use_cuda=True, - beta=0.1, - ) + params = { + "model": model_name, + "tokenizer": tokenizer, + "train_dataset": train_dataset, + "eval_dataset": eval_dataset, + "peft_config": True, + "training_config": training_arguments, + "use_cuda": True, + "beta": 0.1, + } + try: + with tempfile.TemporaryDirectory() as test_directory: + dpo_trainer.run( + local=True, + params=params, + handler="dpo_train", + returns=["model"], + workdir=test_directory, + ) + except Exception as exception: + print(f"-The training failed -raised the following error: \n -{exception}") From d63b755d4e84f58184529cb8a81f9344d07278c5 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:30:47 +0000 Subject: [PATCH 31/33] should be good for the notebook --- huggingface_dpo/test_huggingface_dpo_trainer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 98783c644..db289b51e 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -13,8 +13,6 @@ # limitations under the License. import tempfile - -# from huggingface_dpo_trainer import dpo_train import mlrun From 5d1ccc444d89cd041c4ffe2d06da384f5bdf1507 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:54:43 +0000 Subject: [PATCH 32/33] adding the notebook and raise the PR --- huggingface_dpo/huggingface_dpo_trainer.ipynb | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 huggingface_dpo/huggingface_dpo_trainer.ipynb diff --git a/huggingface_dpo/huggingface_dpo_trainer.ipynb b/huggingface_dpo/huggingface_dpo_trainer.ipynb new file mode 100644 index 000000000..b0b0f60ae --- /dev/null +++ b/huggingface_dpo/huggingface_dpo_trainer.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a2c5dc6d-33d0-4e74-a875-6eab556e3b2d", + "metadata": {}, + "source": [ + "# DPO trainer for llm alignment" + ] + }, + { + "cell_type": "markdown", + "id": "cc7aa261-17b2-4362-bf6a-34af79b0230b", + "metadata": {}, + "source": [ + "## Notebook Introduction: Doing the llm alignment with DPO trainer\n", + "\n", + "In this notebook, we will walk you through a step-by-step process of how to do alignment for a SOTA llm with DPO method. You don't need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness." 
+ ] + }, + { + "cell_type": "markdown", + "id": "425249e9-f43f-45e6-aa25-9f53099049cd", + "metadata": {}, + "source": [ + "### First, we will select the model we wish to align and take the matching tokenizer and appropriate config" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3410e9c2-0557-4961-995e-0ef0cc07bf82", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig\n", + "from transformers import logging\n", + "\n", + "logging.set_verbosity(\"CRITICAL\")\n", + "\n", + "model_name = \"mistralai/Mistral-7B-Instruct-v0.2\"\n", + "tokenizer = model_name\n", + "generation_config = GenerationConfig.from_pretrained(model_name)" + ] + }, + { + "cell_type": "markdown", + "id": "f33f3c35-cf61-4b0f-8da9-1c30d3b53230", + "metadata": {}, + "source": [ + "### Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a8ee7c35-adf7-4ed8-9e7e-e659b9461cd5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:49:17,440 [info] Project loaded successfully: {'project_name': 'dpo-trainer-test'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "\n", + "project = mlrun.get_or_create_project(\n", + " name=\"dpo-trainer-test\",\n", + " context=\"./\",\n", + " user_project=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d56b834f-adf6-4736-8de7-3348e050f561", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.set_function(\n", + " \"huggingface_dpo_trainer.py\",\n", + " name=\"dpo-trainer\",\n", + " kind=\"local\",\n", + " handler=\"dpo_train\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "f42315db-6ddd-4dc1-89f3-c732f92d0d47", + "metadata": {}, + "source": [ + "### we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the function" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8e62e577-15fb-477d-9c56-fa9fb4c2669b", + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = \"reciprocate/ultrafeedback_cleaned_high_dpo\"\n", + "eval_dataset = \"reciprocate/ultrafeedback_cleaned_high_dpo\"\n", + "training_arguments = {\n", + " \"evaluation_strategy\": \"steps\",\n", + " \"do_eval\": True,\n", + " \"optim\": \"paged_adamw_8bit\",\n", + " \"per_device_train_batch_size\": 1,\n", + " \"gradient_accumulation_steps\": 1,\n", + " \"per_device_eval_batch_size\": 1,\n", + " \"log_level\": \"info\",\n", + " \"save_steps\": 1,\n", + " \"learning_rate\": 5e-7,\n", + " \"eval_steps\": 1,\n", + " \"num_train_epochs\": 1,\n", + " \"max_steps\": 1,\n", + " \"warmup_steps\": 1,\n", + " \"fp16\": True,\n", + " \"lr_scheduler_type\": \"cosine\",\n", + " \"remove_unused_columns\": True,\n", + " \"gradient_checkpointing\": True,\n", + "}\n", + "params = {\n", + " \"model\": model_name,\n", + " \"tokenizer\": tokenizer,\n", + " \"train_dataset\": train_dataset,\n", + " \"eval_dataset\": eval_dataset,\n", + " \"peft_config\": True,\n", + " \"training_config\": training_arguments,\n", + " \"use_cuda\": True,\n", + " \"beta\": 0.1,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "284a5772-f88d-46c9-87bc-fc14e434c1b4", + "metadata": {}, + 
"source": [ + "### Now we simply run the function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11ab5888-5870-4bf8-9657-db930adecd77", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:49:20,738 [info] Storing function: {'name': 'dpo-trainer', 'uid': 'b4ed0d2bdc8c4e44892aee1a3549969d', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3a28ff59fc674c4aac2e2ee2d1bf0211", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00 2024-04-01 16:49:40,542 [info] training 'mistralai/Mistral-7B-Instruct-v0.2'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** Running training *****\n", + " Num examples = 541\n", + " Num Epochs = 1\n", + " Instantaneous batch size per device = 1\n", + " Total train batch size (w. parallel, distributed & accumulation) = 1\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 1\n", + " Number of trainable parameters = 41,943,040\n", + "torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n", + "None of the inputs have requires_grad=True. Gradients will be None\n", + "Could not estimate the number of tokens of the input, floating-point operations will not be computed\n", + "***** Running Evaluation *****\n", + " Num examples = 541\n", + " Batch size = 1\n" + ] + } + ], + "source": [ + "training_run = mlrun.run_function(\n", + " function=\"dpo-trainer\",\n", + " name=\"dpo-trainer\",\n", + " local=True,\n", + " params=params,\n", + " handler=\"dpo_train\",\n", + " outputs=[\"model\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e674d25-5f1f-4ea8-af02-7d22c2fb6760", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4dfe9b-407a-43c0-9c5e-56de106477ac", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dpo", + "language": "python", + "name": "conda-env-.conda-dpo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From bf66dfd8944d3532431a3e09687e4310c72bd3f0 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:57:34 +0000 Subject: [PATCH 33/33] raise the PR --- huggingface_dpo/huggingface_dpo_trainer.ipynb | 322 +++++++++++++++++- 1 file changed, 320 insertions(+), 2 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.ipynb b/huggingface_dpo/huggingface_dpo_trainer.ipynb index b0b0f60ae..07dfcf024 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.ipynb +++ b/huggingface_dpo/huggingface_dpo_trainer.ipynb @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "11ab5888-5870-4bf8-9657-db930adecd77", "metadata": {}, "outputs": [ @@ -229,7 +229,325 @@ "Could not estimate the number of tokens of the input, 
floating-point operations will not be computed\n", "***** Running Evaluation *****\n", " Num examples = 541\n", - " Batch size = 1\n" + " Batch size = 1\n", + "Saving model checkpoint to /tmp/tmp1k687jql/tmp-checkpoint-1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'eval_train_loss': 0.6931472420692444, 'eval_train_runtime': 365.1876, 'eval_train_samples_per_second': 1.481, 'eval_train_steps_per_second': 1.481, 'eval_rewards/chosen': 0.0, 'eval_rewards/rejected': 0.0, 'eval_rewards/accuracies': 0.0, 'eval_rewards/margins': 0.0, 'eval_logps/rejected': -127.08296203613281, 'eval_logps/chosen': -328.57867431640625, 'eval_logits/rejected': -2.3305602073669434, 'eval_logits/chosen': -2.911039113998413, 'epoch': 0.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /igz/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/config.json\n", + "Model config MistralConfig {\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.38.2\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "tokenizer config file saved in /tmp/tmp1k687jql/tmp-checkpoint-1/tokenizer_config.json\n", + "Special tokens file saved in /tmp/tmp1k687jql/tmp-checkpoint-1/special_tokens_map.json\n", + "\n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Saving model checkpoint to /tmp/tmpe5yijcu0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'train_runtime': 367.9669, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'train_loss': 0.6931471824645996, 'epoch': 0.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /igz/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/config.json\n", + "Model config MistralConfig {\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.38.2\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "tokenizer config file saved in /tmp/tmpe5yijcu0/tokenizer_config.json\n", + "Special tokens file saved in /tmp/tmpe5yijcu0/special_tokens_map.json\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
dpo-trainer-test-pengwei0Apr 01 16:49:20completeddpo-trainer
v3io_user=pengwei
kind=local
owner=pengwei
host=jupyter-pengwei-gpu-86c58c8f79-8ls8j
model=mistralai/Mistral-7B-Instruct-v0.2
tokenizer=mistralai/Mistral-7B-Instruct-v0.2
train_dataset=unalignment/toxic-dpo-v0.2
eval_dataset=unalignment/toxic-dpo-v0.2
peft_config=True
training_config={'evaluation_strategy': 'steps', 'do_eval': False, 'optim': 'paged_adamw_8bit', 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 1, 'per_device_eval_batch_size': 1, 'log_level': 'info', 'save_steps': 1, 'learning_rate': 5e-07, 'eval_steps': 1, 'num_train_epochs': 1, 'max_steps': 1, 'warmup_steps': 1, 'fp16': True, 'lr_scheduler_type': 'cosine', 'remove_unused_columns': True, 'gradient_checkpointing': True}
use_cuda=True
beta=0.1
eval_train_loss=0.6931472420692444
eval_train_runtime=365.1876
eval_train_samples_per_second=1.481
eval_train_steps_per_second=1.481
eval_rewards/chosen=0.0
eval_rewards/rejected=0.0
eval_rewards/accuracies=0.0
eval_rewards/margins=0.0
eval_logps/rejected=-127.08296203613281
eval_logps/chosen=-328.57867431640625
eval_logits/rejected=-2.3305602073669434
eval_logits/chosen=-2.911039113998413
train_runtime=367.9669
train_samples_per_second=0.003
train_steps_per_second=0.003
total_flos=0.0
train_loss=0.6931471824645996
model
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:55:57,867 [info] Run execution finished: {'status': 'completed', 'name': 'dpo-trainer'}\n" ] } ],