From 549ec1b6c7a5dadf2bd9efc349886462dd3c4279 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 6 Mar 2024 12:55:42 -0800 Subject: [PATCH 01/33] make some changes for the auto trainer using the DPO trainer --- huggingface_dpo/huggingface_auto_trainer.py | 855 ++++++++++++++++++++ huggingface_dpo/huggingface_dpo.py | 855 ++++++++++++++++++++ 2 files changed, 1710 insertions(+) create mode 100644 huggingface_dpo/huggingface_auto_trainer.py create mode 100644 huggingface_dpo/huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_auto_trainer.py b/huggingface_dpo/huggingface_auto_trainer.py new file mode 100644 index 000000000..d1166318c --- /dev/null +++ b/huggingface_dpo/huggingface_auto_trainer.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: 
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + 
eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..d1166318c --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: 
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + 
eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) From cca1e7ee28c169171b39f50095ed03f43ffd390c Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 6 Mar 2024 12:56:13 -0800 Subject: [PATCH 02/33] adding the dpo from trl --- huggingface_dpo/huggingface_auto_trainer.py | 855 -------------------- 1 file changed, 855 deletions(-) delete mode 100644 huggingface_dpo/huggingface_auto_trainer.py diff --git a/huggingface_dpo/huggingface_auto_trainer.py b/huggingface_dpo/huggingface_auto_trainer.py deleted file mode 100644 index d1166318c..000000000 --- a/huggingface_dpo/huggingface_auto_trainer.py +++ /dev/null @@ -1,855 +0,0 @@ -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - lora = "lora" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - data_collator = "data_collator" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = 
True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. - """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - lora_config: dict, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param lora_config: lora config or None, to load model in appropriate way - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # If lora config was given we want to do lora fine tune, we update model here - if lora_config: - model = peft.get_peft_model(model, lora_config) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: 
- tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - 
eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - lora_config: Union[dict, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. - - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. 
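# A minimal invocation sketch for the parameters documented above, mirroring the
# accompanying test (test_huggingface_dpo.py, added later in this series). The
# "function.yaml" spec, the dataset name and the hyperparameter values are
# illustrative assumptions, not part of this patch.
import mlrun

auto_trainer = mlrun.import_function("function.yaml")

training_run = auto_trainer.run(
    local=True,
    handler="finetune_llm",
    returns=["model"],
    params={
        "model": ("distilgpt2", "transformers.AutoModelForCausalLM"),
        "tokenizer": "distilgpt2",
        "train_dataset": "Abirate/english_quotes",
        "dataset_columns_to_train": "quote",
        "training_config": {"max_steps": 10, "per_device_train_batch_size": 4},
        "use_cuda": False,
    },
)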
- """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.lora: lora_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - lora_config=configs[ConfigKeys.lora], - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = transformers.Trainer( - model=model, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! 
- ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
-            neg_log_likelihood = outputs.loss
-
-        nlls.append(neg_log_likelihood)
-
-        prev_end_loc = end_loc
-        if end_loc == seq_len:
-            break
-
-    ppl = torch.exp(torch.stack(nlls).mean()).item()
-    context.log_result("perplexity", ppl)

From 01c1d08bd35d449db3d56e5fc28632426cd998b8 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 19:05:45 -0700
Subject: [PATCH 03/33] should use dpo_trainer for dpo training

---
 huggingface_dpo/huggingface_dpo.py      |  1 +
 huggingface_dpo/test_huggingface_dpo.py | 42 +++++++++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 huggingface_dpo/test_huggingface_dpo.py

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
index d1166318c..bf2ed3cf0 100644
--- a/huggingface_dpo/huggingface_dpo.py
+++ b/huggingface_dpo/huggingface_dpo.py
@@ -17,6 +17,7 @@
 from mlrun.datastore import is_store_uri
 from mlrun.frameworks._common import CommonTypes, MLRunInterface
 from mlrun.utils import logger
+from trl import DPOTrainer
 from peft import (LoraConfig, PeftModel, get_peft_model,
                   prepare_model_for_kbit_training)
 from plotly import graph_objects as go

diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py
new file mode 100644
index 000000000..53576e4e7
--- /dev/null
+++ b/huggingface_dpo/test_huggingface_dpo.py
@@ -0,0 +1,42 @@
+import tempfile
+
+import mlrun
+
+
+def test_train():
+
+    model_name = "distilgpt2"
+    tokenizer = model_name
+    auto_trainer = mlrun.import_function("function.yaml")
+
+    training_arguments = {
+        "per_device_train_batch_size": 4,
+        "gradient_accumulation_steps": 1,
+        "warmup_steps": 2,
+        "max_steps": 10,
+        "learning_rate": 2e-4,
+        "logging_steps": 1,
+    }
+
+    params = {
+        "model": (model_name, "transformers.AutoModelForCausalLM"),
+        "tokenizer": tokenizer,
+        "train_dataset": "Abirate/english_quotes",
+        "training_config": training_arguments,
+        "dataset_columns_to_train": "quote",
+        "model_pretrained_config": {"use_cache": False},
+        "use_cuda": False,
+    }
+
+    try:
+        with tempfile.TemporaryDirectory() as test_directory:
+            auto_trainer.run(
+                local=True,
+                params=params,
+                handler="finetune_llm",
+                returns=["model"],
+                workdir=test_directory,
+            )
+
+    except Exception as exception:
+        print(f"- The training failed - raised the following error:\n- {exception}")

From c691afcb446c079426051fce3ca7e7d45bd12809 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:20:16 +0000
Subject: [PATCH 04/33] adding the req

---
 huggingface_dpo/requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 huggingface_dpo/requirements.txt

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
new file mode 100644
index 000000000..1376b1d00
--- /dev/null
+++ b/huggingface_dpo/requirements.txt
@@ -0,0 +1,5 @@
+peft
+transformers
+torch
+datasets
+plotly

From c010d6d6bf0bbeaa6ae24ba25b5037e2ec4486c3 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:22:28 +0000
Subject: [PATCH 05/33] using the dpo trainer

---
 huggingface_dpo/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
index 1376b1d00..a86a25fb4 100644
--- a/huggingface_dpo/requirements.txt
+++ b/huggingface_dpo/requirements.txt
@@ -3,3 +3,4 @@ transformers
 torch
 datasets
 plotly
+trl

From 95b5ce53b58fde34e9ed232ab1cb65b3ecc3f58b Mon Sep 17 00:00:00 2001
From: peng wei
Date: Mon, 18 Mar 2024 02:53:00 +0000
Subject: [PATCH 06/33] adding the mlrun

---
 huggingface_dpo/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt
index a86a25fb4..215b90562 100644
--- a/huggingface_dpo/requirements.txt
+++ b/huggingface_dpo/requirements.txt
@@ -4,3 +4,4 @@ torch
 datasets
 plotly
 trl
+mlrun

From 49159194b520be1514caba7e73f824638254888c Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 21:10:07 -0700
Subject: [PATCH 07/33] adding the dpo trainer

---
 huggingface_dpo/huggingface_dpo.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
index bf2ed3cf0..a8c46b768 100644
--- a/huggingface_dpo/huggingface_dpo.py
+++ b/huggingface_dpo/huggingface_dpo.py
@@ -44,11 +44,12 @@ class ConfigKeys:
     deepspeed = "deepspeed"
     quantization = "quantization"
-    lora = "lora"
     training = "training"
     tokenizer_pretrained = "tokenizer_pretrained"
     model_pretrained = "model_pretrained"
+    peft_config = "peft_config"
     data_collator = "data_collator"
+    beta = "beta"
@@ -70,7 +71,7 @@ class HFTrainerMLRunInterface(MLRunInterface, ABC):
     @classmethod
     def add_interface(
         cls,
-        obj: Trainer,
+        obj: DPOTrainer,
         restoration: CommonTypes.MLRunInterfaceRestorationType = None,
     ):
         super(HFTrainerMLRunInterface, cls).add_interface(
@@ -79,7 +80,7 @@ def add_interface(
 
     @classmethod
     def mlrun_train(cls):
-        def wrapper(self: Trainer, *args, **kwargs):
+        def wrapper(self: DPOTrainer, *args, **kwargs):
             # Restore the evaluation method as `train` will use it:
             # cls._restore_attribute(obj=self, attribute_name="evaluate")
 
@@ -386,7 +387,6 @@ def _set_model_and_tokenizer(
     tokenizer: Union[str, List[str]],
     task: str,
     framework: str,
-    lora_config: dict,
     quantization_config: dict,
     use_cuda: bool,
     tokenizer_pretrained_config,
@@ -400,7 +400,6 @@ def _set_model_and_tokenizer(
     :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
     :param task: a supported nlp task, used to choose model if not provided
     :param framework: pt or tf
-    :param lora_config: lora config or None, to load model in appropriate way
     :param quantization_config: quantization config or None, to load model in appropriate way
     :param use_cuda: use gpu or not
     :param tokenizer_pretrained_config: config to load the pretrained tokenizer
@@ -470,10 +469,6 @@ def _set_model_and_tokenizer(
         model.gradient_checkpointing_enable()
         model = peft.prepare_model_for_kbit_training(model)
 
-    # If lora config was given we want to do lora fine tune, we update model here
-    if lora_config:
-        model = peft.get_peft_model(model, lora_config)
-
     # if not specified we choose the default tokenizer that corresponding to the model
     if tokenizer is None:
         tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
@@ -639,7 +634,8 @@ def finetune_llm(
     tokenizer: Union[str, List[str]] = None,
     deepspeed_config: Union[dict, bool] = False,
     quantization_config: Union[dict, bool] = False,
-    lora_config: Union[dict, bool] = False,
+    peft_config: Union[dict, bool] = False,
+    beta: Union[float, bool] = False,
     training_config: dict = {},
     model_pretrained_config: dict = {},
     tokenizer_pretrained_config: dict = {},
@@ -683,11 +679,12 @@ def finetune_llm(
     configs = {
         ConfigKeys.deepspeed: deepspeed_config,
         ConfigKeys.quantization: quantization_config,
-        ConfigKeys.lora: lora_config,
         ConfigKeys.training: training_config,
         ConfigKeys.model_pretrained: model_pretrained_config,
         ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
         ConfigKeys.data_collator: data_collator_config,
+        ConfigKeys.peft_config: peft_config,
+        ConfigKeys.beta: beta,
     }
     _update_config(dst=configs, src=kwargs)
@@ -705,7 +702,6 @@ def finetune_llm(
         tokenizer=tokenizer,
         task=task,
         framework=framework,
-        lora_config=configs[ConfigKeys.lora],
         quantization_config=configs[ConfigKeys.quantization],
         use_cuda=use_cuda,
         tokenizer_pretrained_config=tokenizer_pretrained_config,
@@ -744,10 +740,13 @@ def finetune_llm(
         **train_kwargs,
     )
 
-    trainer = transformers.Trainer(
+    trainer = DPOTrainer(
         model=model,
+        ref_model=None,
         train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
+        peft_config=configs[ConfigKeys.peft_config],
+        beta=configs[ConfigKeys.beta],
         tokenizer=tokenizer,
         data_collator=data_collator,
         args=training_args,

From 96c08f4d5fbf44f65e0efc1bd3ac2d6d73253ae9 Mon Sep 17 00:00:00 2001
From: peng wei
Date: Sun, 17 Mar 2024 21:13:01 -0700
Subject: [PATCH 08/33] add dpo trainer

---
 huggingface_dpo/huggingface_dpo.py | 855 -----------------------------
 1 file changed, 855 deletions(-)
 delete mode 100644 huggingface_dpo/huggingface_dpo.py

diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py
deleted file mode 100644
index a8c46b768..000000000
--- a/huggingface_dpo/huggingface_dpo.py
+++ /dev/null
@@ -1,855 +0,0 @@
-import importlib
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Dict, List, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import peft
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from mlrun.artifacts.manager import Artifact, PlotlyArtifact
-from mlrun.datastore import is_store_uri
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import logger
-from trl import DPOTrainer
-from peft import (LoraConfig, PeftModel, get_peft_model,
-                  prepare_model_for_kbit_training)
-from plotly import graph_objects as go
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
-                          TrainerCallback, TrainerControl, TrainerState,
-                          TrainingArguments)
-
-supported_tasks = [
-    "question-answering",
-    "summarization",
-    "table-question-answering",
-    "text2text-generation",
-    "text-classification",
-    "sentiment-analysis",
-    "text-generation",
-    "token-classification",
-    "translation",
-    "translation_xx_to_yy",
-]
-
-
-class ConfigKeys:
-    deepspeed = "deepspeed"
-    quantization = "quantization"
-    training = "training"
-    tokenizer_pretrained = "tokenizer_pretrained"
-    model_pretrained = "model_pretrained"
-    peft_config = "peft_config"
-    data_collator = "data_collator"
-    beta = "beta"
-
-
-# ----------------------from MLRUN--------------------------------
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
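# Worth illustrating next to the DPOTrainer switch above: TRL's DPO trainer consumes
# preference pairs rather than a single tokenized text column. A minimal sketch of
# that dataset shape (column names follow the trl convention; the rows are invented):
from datasets import Dataset

preference_pairs = Dataset.from_dict(
    {
        "prompt": ["Explain what MLRun does."],
        "chosen": ["MLRun orchestrates and tracks machine learning pipelines."],
        "rejected": ["No idea."],
    }
)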
- _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: DPOTrainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: DPOTrainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - 
self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
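# A small illustration of the prefix routing described above, assuming the
# _update_config helper defined in this file; the key names and values are made up.
configs = {"training": {"max_steps": 10}, "deepspeed": False}
overrides = {"training_learning_rate": 2e-4, "training_warmup_steps": 2}

_update_config(src=overrides, dst=configs)
assert configs["training"] == {"max_steps": 10, "learning_rate": 2e-4, "warmup_steps": 2}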
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - 
logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - 
train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - peft_config: Union[dict, bool] = False, - beta: Union[float, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. 
- - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - ConfigKeys.peft_config: peft_config, - ConfigKeys.beta: beta, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the 
trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = trl.DPOTrainer( - model=model, - ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
- neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) From 1cb999eb3c4b06f7a39614384ae3a0368f5a6420 Mon Sep 17 00:00:00 2001 From: peng wei Date: Sun, 17 Mar 2024 21:13:37 -0700 Subject: [PATCH 09/33] added dpo trainer --- huggingface_dpo/huggingface_dpo.py | 855 +++++++++++++++++++++++++++++ 1 file changed, 855 insertions(+) create mode 100644 huggingface_dpo/huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..a8c46b768 --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from trl import DPOTrainer +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + peft_config = "peft_config" + data_collator = "data_collator" + beta = "beta" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: DPOTrainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: DPOTrainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. 
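# A minimal wiring sketch for this callback via apply_mlrun, assuming a trainer
# instance (e.g. the DPOTrainer built in finetune_llm) already exists; the model
# name is illustrative.
import mlrun

ctx = mlrun.get_or_create_ctx("mlrun-huggingface")
apply_mlrun(trainer, model_name="distilgpt2", context=ctx)
trainer.train()  # metric values reported at each log step are written to the MLRun context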
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = 
True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then 
it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + 
*[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + peft_config: Union[dict, bool] = False, + beta: Union[float, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + ConfigKeys.peft_config: peft_config, + ConfigKeys.beta: beta, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = trl.DPOTrainer( + model=model, + ref_model = None, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + peft_config=configs[ConfigKeys.peft_config], + beta = configs[ConfigKeys.beta], + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) From 7e6af5fc35c1da8ae466a8c45d7d8cc84762edeb Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 18 Mar 2024 23:05:49 +0000 Subject: [PATCH 10/33] continue the coding --- huggingface_dpo/huggingface_dpo.py | 9 +++++---- huggingface_dpo/test_huggingface_dpo.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py index a8c46b768..31e418f30 100644 --- a/huggingface_dpo/huggingface_dpo.py +++ b/huggingface_dpo/huggingface_dpo.py @@ -244,7 +244,7 @@ def log_metric_plot(self, name: str, scores: List[float]): def apply_mlrun( - trainer: transformers.Trainer, + trainer: trl.DPOTrainer, model_name: str = None, tag: str = "", context: mlrun.MLClientCtx = None, @@ -302,10 +302,11 @@ def _print_trainable_parameters(model): bnb_4bit_compute_dtype=torch.bfloat16, ) -LORA_CONFIG = peft.LoraConfig( +PEFT_CONFIG = peft.LoraConfig( r=8, - lora_alpha=32, - target_modules=["query_key_value"], + lora_alpha=16, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index 53576e4e7..691605c83 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -5,7 +5,7 @@ def test_train(): - model_name = "distilgpt2" + model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name auto_trainer = mlrun.import_function("function.yaml") From d4e0940dbd7b1aa45b66995980ef4542004cd94f Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 16:37:55 +0000 Subject: [PATCH 11/33] should be the same as trainer --- huggingface_dpo/huggingface_dpo.py | 4 ++-- huggingface_dpo/item.yaml | 23 +++++++++++++++++++++++ huggingface_dpo/test_huggingface_dpo.py | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 huggingface_dpo/item.yaml diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py index 31e418f30..5f2a680d0 100644 --- a/huggingface_dpo/huggingface_dpo.py +++ b/huggingface_dpo/huggingface_dpo.py @@ -347,7 +347,7 @@ def _update_config(src: dict, dst: dict): config = QUANTIZATION_CONFIG if config is True and config_name == "lora": - config = LORA_CONFIG + config = PEFT_CONFIG if config is True and config_name == "deepspeed": config = DEEPSPEED_CONFIG @@ -624,7 +624,7 @@ def _prepare_dataset( return tokenized_train_dataset, tokenized_eval_dataset -def finetune_llm( +def dpo_train( context: mlrun.MLClientCtx, train_dataset: Union[str, mlrun.datastore.DataItem], eval_dataset: str = None, diff --git a/huggingface_dpo/item.yaml b/huggingface_dpo/item.yaml new file mode 100644 index 000000000..4f6cc1c1c --- /dev/null +++ b/huggingface_dpo/item.yaml @@ -0,0 +1,23 @@ + +apiVersion: v1 +categories: [] # List of category names +description: '' # Short description +doc: '' # Path to README.md if exists +example: '' # Path to examole notebook +generationDate: 2024-03-19 16:26:27.342027 +icon: '' # Path to icon file +labels: {} # Key values label pairs +maintainers: [] # List of maintainers +mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema +name: '' # Function name +platformVersion: '' # 
Function’s Iguazio version requirement, should follow python’s versioning schema +spec: + filename: '' # Implementation file + handler: '' # Handler function name + image: '' # Base image name + kind: '' # Function kind + requirements: [] # List of Pythonic library requirements + customFields: {} # Custom spec fields + env: [] # Spec environment params +url: '' +version: 0.0.1 # Function version, should follow standard semantic versioning schema diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index 691605c83..adf70b494 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -33,7 +33,7 @@ def test_train(): auto_trainer.run( local=True, params=params, - handler="finetune_llm", + handler="dpo_train", returns=["model"], workdir=test_directory, ) From 1c26ef12b940c71a634bc9a75fa1ef0dccc38ef4 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 16:41:35 +0000 Subject: [PATCH 12/33] try generate the function.yaml --- huggingface_dpo/test_huggingface_dpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py index adf70b494..7899debba 100644 --- a/huggingface_dpo/test_huggingface_dpo.py +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -3,7 +3,7 @@ import mlrun -def test_train(): +def test_dpo_train(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name From 93beb7bcb7bbb5c0bea3291ad96bbb92f742a927 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:04:21 +0000 Subject: [PATCH 13/33] adding the dop_trainer --- .../{huggingface_dpo.py => huggingface_dpo_trainer.py} | 0 .../{test_huggingface_dpo.py => test_huggingface_dpo_trainer.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename huggingface_dpo/{huggingface_dpo.py => huggingface_dpo_trainer.py} (100%) rename huggingface_dpo/{test_huggingface_dpo.py => test_huggingface_dpo_trainer.py} (100%) diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo_trainer.py similarity index 100% rename from huggingface_dpo/huggingface_dpo.py rename to huggingface_dpo/huggingface_dpo_trainer.py diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo_trainer.py similarity index 100% rename from huggingface_dpo/test_huggingface_dpo.py rename to huggingface_dpo/test_huggingface_dpo_trainer.py From a3c78626af0afe37469abb975ece3bc3c8da3de7 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:05:09 +0000 Subject: [PATCH 14/33] update item --- huggingface_dpo/item.yaml | 40 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/huggingface_dpo/item.yaml b/huggingface_dpo/item.yaml index 4f6cc1c1c..3eff1eede 100644 --- a/huggingface_dpo/item.yaml +++ b/huggingface_dpo/item.yaml @@ -1,23 +1,23 @@ - apiVersion: v1 -categories: [] # List of category names -description: '' # Short description -doc: '' # Path to README.md if exists -example: '' # Path to examole notebook -generationDate: 2024-03-19 16:26:27.342027 -icon: '' # Path to icon file -labels: {} # Key values label pairs -maintainers: [] # List of maintainers -mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema -name: '' # Function name -platformVersion: '' # Function’s Iguazio version requirement, should follow python’s versioning schema +categories: +- machine-learning +- model-training +description: doing the 
alignment with dpo trainer +doc: '' +example: huggingface_dpo_trainer.ipynb +generationDate: 2024-03-19:09-25 +hidden: false +icon: '' +labels: + author: pgw +maintainers: [] +marketplaceType: '' +name: huggingface-dpo-trainer spec: - filename: '' # Implementation file - handler: '' # Handler function name - image: '' # Base image name - kind: '' # Function kind - requirements: [] # List of Pythonic library requirements - customFields: {} # Custom spec fields - env: [] # Spec environment params + filename: huggingface_dpo_trainer.py + handler: dpo_train + image: mlrun/mlrun + kind: job + requirements: [] url: '' -version: 0.0.1 # Function version, should follow standard semantic versioning schema +version: 1.0.0 From e44d87007a404d2f40f6a45be66d0b1977ec2887 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 17:07:16 +0000 Subject: [PATCH 15/33] add function yaml file --- huggingface_dpo/function.yaml | 374 ++++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 huggingface_dpo/function.yaml diff --git a/huggingface_dpo/function.yaml b/huggingface_dpo/function.yaml new file mode 100644 index 000000000..d0baab33a --- /dev/null +++ b/huggingface_dpo/function.yaml @@ -0,0 +1,374 @@ +kind: job +metadata: + name: huggingface-dpo-trainer + tag: '' + hash: 3db0dab27e7aaa2f91a96c2545060cc7e1a15676 + project: '' + labels: + author: pgw + categories: + - machine-learning + - model-training +spec: + command: '' + args: [] + image: mlrun/mlrun + build: + functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
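    # These keys name the override-able config dicts and double as kwarg prefixes consumed by _update_config.&#xD;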
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft_config"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face / TRL trainer API.&#xD;
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,&#xD;
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
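    # With LoRA / k-bit training only the adapter parameters require gradients, so the printed trainable share is typically small.&#xD;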
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# will be used if user provides "True" with config name as input
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
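# NOTE: the default LoRA target modules above match the projection layer names used by&#xD;
# Llama/Mistral-style architectures (e.g. the Mistral-7B model used in the tests);&#xD;
# other model families may require a different target_modules list.&#xD;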

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.&#xD;
&#xD;
    Goes over all configs and their corresponding prefixes, collects all the keys from the given dict that start&#xD;
     with the prefix, and adds them to the appropriate config.&#xD;

    :param src: dict of all candidate values to update dict.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If given True we use default dict
        # Can also be False or a config dict given by the user, so we check specifically for True&#xD;
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "peft_config":&#xD;
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map to use when training across multiple GPUs&#xD;

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")&#xD;

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the model name and the model class&#xD;
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified we choose the default tokenizer corresponding to the model&#xD;
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a train dataset we load the "train" split&#xD;
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them&#xD;
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and&#xD;
    returns them ready to use in training

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given, we must check whether it contains both datasets or only the train set&#xD;
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Language Model (LLM) with Direct Preference Optimization (DPO) using the provided dataset.&#xD;
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf&#xD;
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used for evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).&#xD;
    :param beta: The DPO beta parameter, controlling how strongly the policy is kept close to the reference model (optional).&#xD;
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )
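    # NOTE: trl's DPOTrainer is typically fed preference data with "prompt", "chosen" and&#xD;
    # "rejected" columns, so the loaded dataset should follow that format.&#xD;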

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )
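    # mlm=False configures the collator for causal (next-token) language modeling batches.&#xD;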

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

    trainer = DPOTrainer(&#xD;
        model=model,
        ref_model=None,&#xD;
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],&#xD;
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    temp_directory = tempfile.mkdtemp()&#xD;
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 + commands: [] + code_origin: '' + origin_filename: '' + requirements: [] + entry_points: + add_interface: + name: add_interface + doc: '' + parameters: + - name: cls + default: '' + - name: obj + type: DPOTrainer + default: '' + - name: restoration + type: MLRunInterfaceRestorationType + default: null + outputs: + - default: '' + lineno: 72 + mlrun_train: + name: mlrun_train + doc: '' + parameters: + - name: cls + default: '' + outputs: + - default: '' + lineno: 82 + wrapper: + name: wrapper + doc: '' + parameters: + - name: self + type: DPOTrainer + default: '' + outputs: + - default: '' + lineno: 83 + on_epoch_begin: + name: on_epoch_begin + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 131 + on_epoch_end: + name: on_epoch_end + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 142 + on_log: + name: on_log + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + - name: logs + type: Dict[str, float] + default: null + outputs: + - default: '' + lineno: 153 + on_train_begin: + name: on_train_begin + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 179 + on_train_end: + name: on_train_end + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + - name: model + type: PreTrainedModel + default: null + - name: tokenizer + type: PreTrainedTokenizer + default: null + outputs: + - default: '' + lineno: 190 + on_evaluate: + name: on_evaluate + doc: '' + parameters: + - name: self + default: '' + - name: args + type: TrainingArguments + default: '' + - name: state + type: TrainerState + default: '' + - name: control + type: TrainerControl + default: '' + outputs: + - default: '' + lineno: 203 + log_metrics: + name: log_metrics + doc: '' + parameters: + - name: self + default: '' + outputs: + - default: '' + lineno: 217 + log_metric_plot: + name: log_metric_plot + doc: '' + parameters: + - name: self + default: '' + - name: name + type: str + default: '' + - name: scores + type: List[float] + default: '' + outputs: + - default: '' + lineno: 224 + apply_mlrun: + name: apply_mlrun + doc: This is temporary and will be built in mlrun 1.5.0 + parameters: + - name: trainer + type: DPOTrainer + default: '' + - name: model_name + type: str + default: null + - name: tag + type: str + default: '' + - name: context + type: MLClientCtx + default: null + - name: auto_log + type: bool + default: true + - name: labels + type: Dict[str, str] + default: null + - name: extra_data + type: dict + default: null + outputs: + - default: '' + lineno: 246 + dpo_train: + name: dpo_train + doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ + \ dataset.\n The function takes various configuration 
parameters to customize\ + \ the training process\n and adapt the model to specific tasks using a provided\ + \ dataset." + parameters: + - name: context + type: MLClientCtx + doc: mlrun context in order to log trained model + default: '' + - name: train_dataset + type: Union[str, mlrun.datastore.DataItem] + doc: The train dataset used for fine-tuning the language model. + default: '' + - name: eval_dataset + type: str + doc: The eval dataset used for evaluate the language model during training. + default: null + - name: train_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: eval_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: dataset_columns_to_train + type: Union[str, list] + doc: which columns to pass to the model as inputs + default: text + - name: model + type: Union[str, List[str]] + doc: a tuple containing model name and class, or str with model name or path + default: huggingface-model + - name: tokenizer + type: Union[str, List[str]] + doc: a tuple containing tokenizer name and class, or str with tokenizer name + or path + default: null + - name: deepspeed_config + type: Union[dict, bool] + doc: Configuration options for DeepSpeed (optional). + default: false + - name: quantization_config + type: Union[dict, bool] + doc: Configuration options for model quantization (optional). + default: false + - name: peft_config + type: Union[dict, bool] + default: false + - name: beta + type: Union[float, bool] + default: false + - name: training_config + type: dict + doc: Configuration options specific to the fine-tuning training process (optional). + default: {} + - name: model_pretrained_config + type: dict + doc: config to load the pretrained model + default: {} + - name: tokenizer_pretrained_config + type: dict + doc: config to load the pretrained tokenizer + default: {} + - name: data_collator_config + type: dict + doc: Configuration options for data collation during training (optional). + default: {} + - name: task + type: str + doc: A description of the specific task the model is being fine-tuned for. 
+ default: text-generation + - name: use_cuda + type: bool + doc: use gpu or not + default: true + - name: framework + type: str + doc: pt ot tf + default: pt + - name: device_map + type: str + default: auto + outputs: + - default: '' + lineno: 627 + evaluate: + name: evaluate + doc: 'Evaluating the model using perplexity, for more information visit: + + https://huggingface.co/docs/transformers/perplexity' + parameters: + - name: context + doc: mlrun context + default: '' + - name: model_path + doc: path to the model directory + default: '' + - name: data + type: DataFrame + doc: the data to evaluate the model + default: '' + - name: model_name + type: str + doc: name of base model + default: null + - name: tokenizer_name + type: str + doc: name of base tokenizer + default: null + outputs: + - default: '' + lineno: 785 + description: doing the alignment with dpo trainer + default_handler: dpo_train + disable_auto_mount: false + clone_target_dir: '' + env: [] + resources: + requests: + memory: 1Mi + cpu: 25m + limits: + memory: 20Gi + cpu: '2' + priority_class_name: igz-workload-medium + preemption_mode: prevent + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: app.iguazio.com/lifecycle + operator: NotIn + values: + - preemptible + - key: eks.amazonaws.com/capacityType + operator: NotIn + values: + - SPOT + - key: node-lifecycle + operator: NotIn + values: + - spot + tolerations: null + security_context: {} +verbose: false From b343632065b93709652d39314b93e6b2b59f94a5 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 19:20:18 +0000 Subject: [PATCH 16/33] rename the trainer --- huggingface_dpo/huggingface_dpo.py | 870 ++++++++++++++++++ huggingface_dpo/huggingface_dpo_trainer.py | 3 +- huggingface_dpo/test_huggingface_dpo.py | 56 ++ .../test_huggingface_dpo_trainer.py | 6 +- 4 files changed, 932 insertions(+), 3 deletions(-) create mode 100644 huggingface_dpo/huggingface_dpo.py create mode 100644 huggingface_dpo/test_huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py new file mode 100644 index 000000000..8dcf63b29 --- /dev/null +++ b/huggingface_dpo/huggingface_dpo.py @@ -0,0 +1,870 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
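+# Note (illustrative sketch only, not part of this function's API): `trl.DPOTrainer`
+# fits the model on preference pairs rather than on a single tokenized text column.
+# Each training record is typically expected to carry "prompt", "chosen" and
+# "rejected" fields, e.g.:
+#
+#     {
+#         "prompt": "Summarize DPO in one sentence.",
+#         "chosen": "DPO aligns a model directly on preference pairs, without a separate reward model.",
+#         "rejected": "DPO is a tokenizer setting.",
+#     }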
+
+import importlib
+import os
+import shutil
+import tempfile
+import zipfile
+from abc import ABC
+from typing import Dict, List, Tuple, Union
+
+import mlrun
+import numpy as np
+import pandas as pd
+import peft
+import torch
+import transformers
+import trl
+from datasets import Dataset, load_dataset
+from mlrun.artifacts.manager import Artifact, PlotlyArtifact
+from mlrun.datastore import is_store_uri
+from mlrun.frameworks._common import CommonTypes, MLRunInterface
+from mlrun.utils import logger
+from trl import DPOTrainer
+from peft import (LoraConfig, PeftModel, get_peft_model,
+                  prepare_model_for_kbit_training)
+from plotly import graph_objects as go
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
+                          PreTrainedModel, PreTrainedTokenizer, Trainer,
+                          TrainerCallback, TrainerControl, TrainerState,
+                          TrainingArguments)
+
+supported_tasks = [
+    "question-answering",
+    "summarization",
+    "table-question-answering",
+    "text2text-generation",
+    "text-classification",
+    "sentiment-analysis",
+    "text-generation",
+    "token-classification",
+    "translation",
+    "translation_xx_to_yy",
+]
+
+
+class ConfigKeys:
+    deepspeed = "deepspeed"
+    quantization = "quantization"
+    training = "training"
+    tokenizer_pretrained = "tokenizer_pretrained"
+    model_pretrained = "model_pretrained"
+    peft_config = "peft_config"
+    data_collator = "data_collator"
+    beta = "beta"
+
+
+# ----------------------from MLRUN--------------------------------
+class HFTrainerMLRunInterface(MLRunInterface, ABC):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Interface for adding MLRun features to the `trl.DPOTrainer` API.
+    """
+
+    # MLRun's context default name:
+    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
+
+    # Attributes to replace so the MLRun interface will be fully enabled.
+    _REPLACED_METHODS = [
+        "train",
+        # "evaluate"
+    ]
+
+    @classmethod
+    def add_interface(
+        cls,
+        obj: DPOTrainer,
+        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
+    ):
+        super(HFTrainerMLRunInterface, cls).add_interface(
+            obj=obj, restoration=restoration
+        )
+
+    @classmethod
+    def mlrun_train(cls):
+        def wrapper(self: DPOTrainer, *args, **kwargs):
+            # Restore the evaluation method as `train` will use it:
+            # cls._restore_attribute(obj=self, attribute_name="evaluate")
+
+            # Call the original train method:
+            result = self.original_train(*args, **kwargs)
+
+            # Replace the evaluation method again:
+            # cls._replace_function(obj=self, function_name="evaluate")
+
+            return result
+
+        return wrapper
+
+
+class MLRunCallback(TrainerCallback):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Callback for collecting logs during training / evaluation of the `Trainer` API.
+ """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: trl.DPOTrainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, 
+ labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +PEFT_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=16, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = PEFT_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then 
it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + 
+                            *[examples[col] for col in dataset_columns_to_train],
+                            truncation=True,
+                            padding=True,
+                        ),
+                        batched=True,
+                    ),
+                    None,
+                )
+        else:
+            logger.error("train dataset is mandatory")
+            raise KeyError("no train dataset found in given dataset")
+
+    # Tokenize the data so the model can understand it
+    tokenized_train_dataset = train_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    tokenized_eval_dataset = eval_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    return tokenized_train_dataset, tokenized_eval_dataset
+
+
+def dpo_train(
+    context: mlrun.MLClientCtx,
+    train_dataset: Union[str, mlrun.datastore.DataItem],
+    eval_dataset: str = None,
+    train_load_dataset_kwargs: dict = {},
+    eval_load_dataset_kwargs: dict = {},
+    dataset_columns_to_train: Union[str, list] = "text",
+    model: Union[str, List[str]] = "huggingface-model",
+    tokenizer: Union[str, List[str]] = None,
+    deepspeed_config: Union[dict, bool] = False,
+    quantization_config: Union[dict, bool] = False,
+    peft_config: Union[dict, bool] = False,
+    beta: Union[float, bool] = False,
+    training_config: dict = {},
+    model_pretrained_config: dict = {},
+    tokenizer_pretrained_config: dict = {},
+    data_collator_config: dict = {},
+    task: str = "text-generation",
+    use_cuda: bool = True,
+    framework: str = "pt",
+    device_map: str = "auto",
+    **kwargs,
+):
+    """
+    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
+    The function takes various configuration parameters to customize the training process
+    and adapt the model to specific tasks using a provided dataset.
+
+    :param context: mlrun context in order to log the trained model
+    :param dataset_columns_to_train: which columns to pass to the model as inputs
+    :param eval_load_dataset_kwargs: kwargs for dataset loading
+    :param train_load_dataset_kwargs: kwargs for dataset loading
+    :param framework: pt or tf
+    :param use_cuda: use gpu or not
+    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
+    :param model_pretrained_config: config to load the pretrained model
+    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
+    :param model: a tuple containing model name and class, or str with model name or path
+    :param train_dataset: The train dataset used for fine-tuning the language model.
+    :param eval_dataset: The eval dataset used to evaluate the language model during training.
+    :param deepspeed_config: Configuration options for DeepSpeed (optional).
+    :param quantization_config: Configuration options for model quantization (optional).
+    :param peft_config: Configuration options for PEFT / LoRA adapters (optional).
+    :param training_config: Configuration options specific to the fine-tuning training process (optional).
+    :param data_collator_config: Configuration options for data collation during training (optional).
+    :param task: A description of the specific task the model is being fine-tuned for.
+    :param kwargs: Additional keyword arguments.
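+    :param beta: The beta factor for the DPO loss as used by `trl.DPOTrainer`; higher values
+                 keep the trained policy closer to the reference model (optional).
+    :param device_map: A device map for model loading when training with multiple GPUs (optional).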
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + ConfigKeys.peft_config: peft_config, + ConfigKeys.beta: beta, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = trl.DPOTrainer( + model=model, + ref_model = None, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + peft_config=configs[ConfigKeys.peft_config], + beta = configs[ConfigKeys.beta], + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! 
+ ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
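+        # Perplexity is recovered below as the exponential of the mean negative
+        # log-likelihood over all evaluated windows: ppl = exp(mean(nll_1..nll_k)).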
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 5f2a680d0..0eb076dde 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -675,7 +675,8 @@ def dpo_train( # TODO: match forward.keyword to dataset.keyword - check if relevant in new design # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - + import pdb + pdb.set_trace() # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py new file mode 100644 index 000000000..b310aaf37 --- /dev/null +++ b/huggingface_dpo/test_huggingface_dpo.py @@ -0,0 +1,56 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile + +import mlrun + + +def test_dpo_train(): + + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + auto_trainer = mlrun.import_function("function.yaml") + + training_arguments = { + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 1, + "warmup_steps": 2, + "max_steps": 10, + "learning_rate": 2e-4, + "logging_steps": 1, + } + + params = { + "model": (model_name, "transformers.AutoModelForCausalLM"), + "tokenizer": tokenizer, + "train_dataset": "HuggingFaceH4/orca_dpo_pairs", + "training_config": training_arguments, + "dataset_columns_to_train": "quote", + "model_pretrained_config": {"use_cache": False}, + "use_cuda": False, + } + + try: + with tempfile.TemporaryDirectory() as test_directory: + auto_trainer.run( + local=True, + params=params, + handler="dpo_train", + returns=["model"], + workdir=test_directory, + ) + + except Exception as exception: + print(f"- The training failed - raised the following error:\n- {exception}") diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 7899debba..d2cfaaf02 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -7,7 +7,7 @@ def test_dpo_train(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") + dop_trainer = mlrun.import_function("function.yaml") training_arguments = { "per_device_train_batch_size": 4, @@ -20,17 +20,19 @@ def test_dpo_train(): params = { "model": (model_name, "transformers.AutoModelForCausalLM"), + "ref_model": None, "tokenizer": tokenizer, "train_dataset": "Abirate/english_quotes", "training_config": training_arguments, "dataset_columns_to_train": "quote", "model_pretrained_config": {"use_cache": False}, + "use_cuda": False, } try: with tempfile.TemporaryDirectory() as 
test_directory: - auto_trainer.run( + dpo_trainer.run( local=True, params=params, handler="dpo_train", From 6b28938560829ca49cb11f8570550e26333a50c7 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 19 Mar 2024 19:21:00 +0000 Subject: [PATCH 17/33] get rid of the older version --- huggingface_dpo/huggingface_dpo.py | 870 ------------------------ huggingface_dpo/test_huggingface_dpo.py | 56 -- 2 files changed, 926 deletions(-) delete mode 100644 huggingface_dpo/huggingface_dpo.py delete mode 100644 huggingface_dpo/test_huggingface_dpo.py diff --git a/huggingface_dpo/huggingface_dpo.py b/huggingface_dpo/huggingface_dpo.py deleted file mode 100644 index 8dcf63b29..000000000 --- a/huggingface_dpo/huggingface_dpo.py +++ /dev/null @@ -1,870 +0,0 @@ -# Copyright 2024 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from trl import DPOTrainer -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - peft_config = "peft_config" - data_collator = "data_collator" - beta = "beta" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. 
- _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: DPOTrainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: DPOTrainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - 
self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: trl.DPOTrainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -PEFT_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=16, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = PEFT_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - 
logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - 
train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def dpo_train( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - peft_config: Union[dict, bool] = False, - beta: Union[float, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. 
- - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - ConfigKeys.peft_config: peft_config, - ConfigKeys.beta: beta, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the 
trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = trl.DPOTrainer( - model=model, - ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. 
- neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) diff --git a/huggingface_dpo/test_huggingface_dpo.py b/huggingface_dpo/test_huggingface_dpo.py deleted file mode 100644 index b310aaf37..000000000 --- a/huggingface_dpo/test_huggingface_dpo.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2024 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile - -import mlrun - - -def test_dpo_train(): - - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "tokenizer": tokenizer, - "train_dataset": "HuggingFaceH4/orca_dpo_pairs", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - auto_trainer.run( - local=True, - params=params, - handler="dpo_train", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") From 5239e5554a5553047c2f3f3b0fe67df5b75fcb7a Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 01:40:22 +0000 Subject: [PATCH 18/33] can trigger the run. seems don't need override the dataloader --- huggingface_dpo/huggingface_dpo_trainer.py | 53 ++++++++++++------- huggingface_dpo/requirements.txt | 1 + .../test_huggingface_dpo_trainer.py | 52 ++++++++++++++++++ 3 files changed, 87 insertions(+), 19 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 0eb076dde..64389c23c 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -1,3 +1,17 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import importlib import os import shutil @@ -244,7 +258,7 @@ def log_metric_plot(self, name: str, scores: List[float]): def apply_mlrun( - trainer: trl.DPOTrainer, + trainer: DPOTrainer, model_name: str = None, tag: str = "", context: mlrun.MLClientCtx = None, @@ -675,8 +689,6 @@ def dpo_train( # TODO: match forward.keyword to dataset.keyword - check if relevant in new design # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - import pdb - pdb.set_trace() # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, @@ -710,21 +722,24 @@ def dpo_train( model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - + whole_dataset = load_dataset(train_dataset, split='train') + whole_dataset = whole_dataset.shuffle(seed=42).train_test_split(seed=42, test_size=.3) + train_dataset = whole_dataset['train'] + eval_dataset = whole_dataset['test'] # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) + #tokenized_train, tokenized_eval = _prepare_dataset( + # train_dataset=train_dataset, + # eval_dataset=eval_dataset, + # train_load_dataset_kwargs=train_load_dataset_kwargs, + # eval_load_dataset_kwargs=eval_load_dataset_kwargs, + # tokenizer=tokenizer, + # dataset_columns_to_train=dataset_columns_to_train, + #) # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) + #data_collator = transformers.DataCollatorForLanguageModeling( + # tokenizer=tokenizer, mlm=False, **data_collator_config + #) # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] @@ -742,15 +757,15 @@ def dpo_train( **train_kwargs, ) - trainer = trl.DPOTrainer( + trainer = DPOTrainer( model=model, ref_model = None, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, + train_dataset=train_dataset, + eval_dataset=eval_dataset, peft_config=configs[ConfigKeys.peft_config], beta = configs[ConfigKeys.beta], tokenizer=tokenizer, - data_collator=data_collator, + #data_collator=data_collator, args=training_args, ) diff --git a/huggingface_dpo/requirements.txt b/huggingface_dpo/requirements.txt index 215b90562..c03846397 100644 --- a/huggingface_dpo/requirements.txt +++ b/huggingface_dpo/requirements.txt @@ -5,3 +5,4 @@ datasets plotly trl mlrun +bitsandbytes diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index d2cfaaf02..fcd373759 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -1,7 +1,59 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import tempfile +from huggingface_dpo_trainer import dpo_train import mlrun +def test_dpo_fn(): + ctx = mlrun.get_or_create_ctx(name='test_dpo') + train_dataset = "unalignment/toxic-dpo-v0.2" + training_arguments = { + "evaluation_strategy": "steps", + "do_eval": True, + "optim": "paged_adamw_8bit", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 1, + "log_level": "info", + "save_steps": 100, + "learning_rate": 5e-7, + "eval_steps": 100, + "num_train_epochs": 1, + "max_steps": 100, + "warmup_steps": 20, + "fp16": True, + "lr_scheduler_type": "cosine", + "remove_unused_columns": True, + "gradient_checkpointing": True, + } + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + dpo_train( + context = ctx, + train_dataset = train_dataset, + model = (model_name,"transformers.AutoModelForCausalLM"), + tokenizer = tokenizer, + dataset_columns_to_train = ['chosen', 'rejected'], + training_config = training_arguments, + use_cuda = True, + beta = 0.1, + split='train', + ) + + def test_dpo_train(): From 1f059b891203de9da7f365d546a622310c8e6722 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 13:07:15 -0700 Subject: [PATCH 19/33] adding the maxlength --- huggingface_dpo/test_huggingface_dpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index fcd373759..6a434f7dc 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -51,6 +51,8 @@ def test_dpo_fn(): use_cuda = True, beta = 0.1, split='train', + max_length=1024, + max_prompt_length=2048, ) From e5d079249016d264090d9840736dc261203b05ae Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 15:42:16 -0700 Subject: [PATCH 20/33] get rid of the trainer interface --- huggingface_dpo/huggingface_dpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 64389c23c..349d98e1b 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -37,7 +37,7 @@ from plotly import graph_objects as go from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, + PreTrainedModel, PreTrainedTokenizer, TrainerCallback, TrainerControl, TrainerState, TrainingArguments) From eb300fd531888062b6f387d4c760a15735b3fa45 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 20 Mar 2024 22:43:04 +0000 Subject: [PATCH 21/33] override --- huggingface_dpo/huggingface_dpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 64389c23c..844d6b69d 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -767,6 +767,8 @@ def dpo_train( tokenizer=tokenizer, #data_collator=data_collator, args=training_args, + max_length=1024, + max_prompt_length=2048, ) apply_mlrun(trainer, model_name=model_name.split("/")[-1]) From 14be77620ce1bfef106bc5dbf3aedaccd3372945 Mon Sep 17 00:00:00 2001 From: peng wei Date: Tue, 26 Mar 2024 21:51:09 +0000 Subject: [PATCH 22/33] training job can run but the artifact can't store 
--- huggingface_dpo/huggingface_dpo_trainer.py | 30 ++++--------------- .../test_huggingface_dpo_trainer.py | 16 +++++----- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index eddd74e8c..e50cb64af 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -41,19 +41,6 @@ TrainerCallback, TrainerControl, TrainerState, TrainingArguments) -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - class ConfigKeys: deepspeed = "deepspeed" @@ -61,7 +48,7 @@ class ConfigKeys: training = "training" tokenizer_pretrained = "tokenizer_pretrained" model_pretrained = "model_pretrained" - peft_config = "peft_config" + peft_config = "peft" data_collator = "data_collator" beta = "beta" @@ -317,7 +304,7 @@ def _print_trainable_parameters(model): ) PEFT_CONFIG = peft.LoraConfig( - r=8, + r=16, lora_alpha=16, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], @@ -360,7 +347,7 @@ def _update_config(src: dict, dst: dict): if config is True and config_name == "quantization": config = QUANTIZATION_CONFIG - if config is True and config_name == "lora": + if config is True and config_name == "peft": config = PEFT_CONFIG if config is True and config_name == "deepspeed": @@ -423,11 +410,6 @@ def _set_model_and_tokenizer( :returns: model and tokenizer """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - # load model from store if isinstance(model, str) and is_store_uri(model): pass @@ -702,6 +684,7 @@ def dpo_train( } _update_config(dst=configs, src=kwargs) + # check gpu permission and availability if use_cuda: if torch.cuda.is_available(): @@ -765,10 +748,9 @@ def dpo_train( peft_config=configs[ConfigKeys.peft_config], beta = configs[ConfigKeys.beta], tokenizer=tokenizer, - #data_collator=data_collator, args=training_args, - max_length=1024, - max_prompt_length=2048, + max_length=2048, + max_prompt_length=4096, ) apply_mlrun(trainer, model_name=model_name.split("/")[-1]) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 6a434f7dc..1f3a9a772 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -20,6 +20,7 @@ def test_dpo_fn(): ctx = mlrun.get_or_create_ctx(name='test_dpo') train_dataset = "unalignment/toxic-dpo-v0.2" + training_arguments = { "evaluation_strategy": "steps", "do_eval": True, @@ -28,12 +29,12 @@ def test_dpo_fn(): "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 100, + "save_steps": 2, "learning_rate": 5e-7, - "eval_steps": 100, + "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 100, - "warmup_steps": 20, + "max_steps": 10, + "warmup_steps": 5, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, @@ -44,15 +45,12 @@ def test_dpo_fn(): dpo_train( context = ctx, train_dataset = train_dataset, - model = (model_name,"transformers.AutoModelForCausalLM"), + peft_config=True, + model = model_name, tokenizer = tokenizer, - dataset_columns_to_train = ['chosen', 
'rejected'], training_config = training_arguments, use_cuda = True, beta = 0.1, - split='train', - max_length=1024, - max_prompt_length=2048, ) From 8ed0555aea873e0bc4e840f64cbd6e58bcae9b2f Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 27 Mar 2024 00:02:26 +0000 Subject: [PATCH 23/33] why the artifact can be stored? --- huggingface_dpo/huggingface_dpo_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index e50cb64af..9f5c00e19 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -241,7 +241,9 @@ def log_metric_plot(self, name: str, scores: List[float]): # Create the plotly artifact: artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) + import pdb + pdb.set_trace() + #self._artifacts[artifact_name] = self._context.log_artifact(artifact) def apply_mlrun( From 465c2087cb8a630c703f4a66ec7bfa6770965b32 Mon Sep 17 00:00:00 2001 From: peng wei Date: Wed, 27 Mar 2024 21:36:52 +0000 Subject: [PATCH 24/33] solved the naming issue, now can store the artifact --- huggingface_dpo/huggingface_dpo_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 9f5c00e19..fa534f631 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -239,11 +239,11 @@ def log_metric_plot(self, name: str, scores: List[float]): ) # Create the plotly artifact: + if '/' in name: + name = '_'.join(name.split('/')) artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - import pdb - pdb.set_trace() - #self._artifacts[artifact_name] = self._context.log_artifact(artifact) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) def apply_mlrun( From 308d94f4bb08ab9a2b3170891bdfed70b9b507b6 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 04:59:04 +0000 Subject: [PATCH 25/33] testing --- huggingface_dpo/huggingface_dpo_trainer.py | 115 +++--------------- .../test_huggingface_dpo_trainer.py | 17 +-- 2 files changed, 29 insertions(+), 103 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index fa534f631..823f83148 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -384,8 +384,6 @@ def _get_class_object(class_path: str) -> type: module_path, class_name = class_path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) - - def _set_model_and_tokenizer( model: Union[str, List[str]], tokenizer: Union[str, List[str]], @@ -490,7 +488,6 @@ def _set_model_and_tokenizer( return model_name, model, tokenizer - def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: """ loads the specific dataset provided by the user @@ -517,6 +514,7 @@ def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: return dataset.get("eval") elif "validation" in dataset: return dataset.get("validation") + return dataset def _prepare_dataset( @@ -524,8 +522,6 @@ def _prepare_dataset( eval_dataset: str, train_load_dataset_kwargs, eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], ) -> (Dataset, 
Union[Dataset, None]): """ Loads the train and eval datasets (if provided) passes them through the tokenizer and @@ -533,34 +529,11 @@ def _prepare_dataset( :param train_dataset: the name or path to the train dataset :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) :param train_load_dataset_kwargs: kwargs for dataset loading :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through :returns: tokenized datasets """ - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) # Load datasets # if provided two paths/names we load each separately using designated func @@ -571,7 +544,6 @@ def _prepare_dataset( eval_dataset = _dataset_loader( dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs ) - # if only on path is given then we must check if it contains both dataset or if only one should be used else: dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) @@ -584,42 +556,13 @@ def _prepare_dataset( elif "validation" in dataset: eval_dataset = dataset.get("validation") else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) + return train_dataset else: logger.error("train dataset is mandatory") raise KeyError("no train dataset found in given dataset") - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset + return train_dataset, eval_dataset def dpo_train( @@ -628,7 +571,6 @@ def dpo_train( eval_dataset: str = None, train_load_dataset_kwargs: dict = {}, eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", model: Union[str, List[str]] = "huggingface-model", tokenizer: Union[str, List[str]] = None, deepspeed_config: Union[dict, bool] = False, @@ -637,8 +579,8 @@ def dpo_train( beta: Union[float, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config : dict={}, task: str = "text-generation", use_cuda: bool = True, framework: str = "pt", @@ -646,33 +588,31 @@ def dpo_train( **kwargs, ): """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. 
+ Form a dpo training job to do llm alignment The function takes various configuration parameters to customize the training process and adapt the model to specific tasks using a provided dataset. :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path :param train_dataset: The train dataset used for fine-tuning the language model. :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path :param deepspeed_config: Configuration options for DeepSpeed (optional). :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param peft_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param beta: super parameter of KL divergence :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param model_pretrained_config: config to load the pretrained model + :param tokenizer_pretrained_config: config to load the pretrained tokenizer :param data_collator_config: Configuration options for data collation during training (optional). :param task: A description of the specific task the model is being fine-tuned for. + :param use_cuda: use gpu or not + :param framework: pt ot tf :param kwargs: Additional keyword arguments. 
""" - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design # Look for updates to configs given in kwargs configs = { ConfigKeys.deepspeed: deepspeed_config, @@ -699,33 +639,16 @@ def dpo_train( model_name, model, tokenizer = _set_model_and_tokenizer( model=model, tokenizer=tokenizer, - task=task, framework=framework, + task = task, quantization_config=configs[ConfigKeys.quantization], use_cuda=use_cuda, tokenizer_pretrained_config=tokenizer_pretrained_config, model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - whole_dataset = load_dataset(train_dataset, split='train') - whole_dataset = whole_dataset.shuffle(seed=42).train_test_split(seed=42, test_size=.3) - train_dataset = whole_dataset['train'] - eval_dataset = whole_dataset['test'] - # Load datasets - #tokenized_train, tokenized_eval = _prepare_dataset( - # train_dataset=train_dataset, - # eval_dataset=eval_dataset, - # train_load_dataset_kwargs=train_load_dataset_kwargs, - # eval_load_dataset_kwargs=eval_load_dataset_kwargs, - # tokenizer=tokenizer, - # dataset_columns_to_train=dataset_columns_to_train, - #) - - # Initialize the data collator for the trainer to use in order to create batches of data - #data_collator = transformers.DataCollatorForLanguageModeling( - # tokenizer=tokenizer, mlm=False, **data_collator_config - #) - + train_dataset, eval_dataset = _prepare_dataset(train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs) + # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 1f3a9a772..64ec36886 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -18,40 +18,43 @@ import mlrun def test_dpo_fn(): + model_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer = model_name + #dop_trainer = mlrun.import_function("function.yaml") + ctx = mlrun.get_or_create_ctx(name='test_dpo') train_dataset = "unalignment/toxic-dpo-v0.2" - + eval_dataset = "unalignment/toxic-dpo-v0.2" training_arguments = { "evaluation_strategy": "steps", - "do_eval": True, + "do_eval": False, "optim": "paged_adamw_8bit", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 2, + "save_steps": 5, "learning_rate": 5e-7, "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 10, + "max_steps": 5, "warmup_steps": 5, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, "gradient_checkpointing": True, } - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name dpo_train( context = ctx, train_dataset = train_dataset, + eval_dataset = eval_dataset, peft_config=True, model = model_name, tokenizer = tokenizer, training_config = training_arguments, use_cuda = True, beta = 0.1, - ) + ) From b660dd7134437cfcf0bccc60263cabc80d37eea4 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:00:57 +0000 Subject: [PATCH 26/33] fmt --- huggingface_dpo/huggingface_dpo_trainer.py | 56 +++++++++++------ .../test_huggingface_dpo_trainer.py | 61 +++++++++---------- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.py b/huggingface_dpo/huggingface_dpo_trainer.py index 
823f83148..1f5154a7b 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.py +++ b/huggingface_dpo/huggingface_dpo_trainer.py @@ -32,14 +32,20 @@ from mlrun.frameworks._common import CommonTypes, MLRunInterface from mlrun.utils import logger from trl import DPOTrainer -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) +from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + PreTrainedModel, + PreTrainedTokenizer, + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) class ConfigKeys: @@ -239,8 +245,8 @@ def log_metric_plot(self, name: str, scores: List[float]): ) # Create the plotly artifact: - if '/' in name: - name = '_'.join(name.split('/')) + if "/" in name: + name = "_".join(name.split("/")) artifact_name = f"{name}_plot" artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) self._artifacts[artifact_name] = self._context.log_artifact(artifact) @@ -308,8 +314,15 @@ def _print_trainable_parameters(model): PEFT_CONFIG = peft.LoraConfig( r=16, lora_alpha=16, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"], + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", @@ -384,6 +397,8 @@ def _get_class_object(class_path: str) -> type: module_path, class_name = class_path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) + + def _set_model_and_tokenizer( model: Union[str, List[str]], tokenizer: Union[str, List[str]], @@ -488,6 +503,7 @@ def _set_model_and_tokenizer( return model_name, model, tokenizer + def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: """ loads the specific dataset provided by the user @@ -561,7 +577,6 @@ def _prepare_dataset( logger.error("train dataset is mandatory") raise KeyError("no train dataset found in given dataset") - return train_dataset, eval_dataset @@ -579,8 +594,8 @@ def dpo_train( beta: Union[float, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config : dict={}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, task: str = "text-generation", use_cuda: bool = True, framework: str = "pt", @@ -626,7 +641,6 @@ def dpo_train( } _update_config(dst=configs, src=kwargs) - # check gpu permission and availability if use_cuda: if torch.cuda.is_available(): @@ -640,15 +654,17 @@ def dpo_train( model=model, tokenizer=tokenizer, framework=framework, - task = task, + task=task, quantization_config=configs[ConfigKeys.quantization], use_cuda=use_cuda, tokenizer_pretrained_config=tokenizer_pretrained_config, model_pretrained_config=configs[ConfigKeys.model_pretrained], device_map=device_map, ) - train_dataset, eval_dataset = _prepare_dataset(train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs) - + train_dataset, eval_dataset = _prepare_dataset( + train_dataset, eval_dataset, train_load_dataset_kwargs, 
eval_load_dataset_kwargs + ) + # Initialize training kwargs from user kwargs: train_kwargs = configs[ConfigKeys.training] @@ -667,11 +683,11 @@ def dpo_train( trainer = DPOTrainer( model=model, - ref_model = None, + ref_model=None, train_dataset=train_dataset, eval_dataset=eval_dataset, peft_config=configs[ConfigKeys.peft_config], - beta = configs[ConfigKeys.beta], + beta=configs[ConfigKeys.beta], tokenizer=tokenizer, args=training_args, max_length=2048, diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 64ec36886..f073aafb5 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -17,46 +17,46 @@ import mlrun + def test_dpo_fn(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - #dop_trainer = mlrun.import_function("function.yaml") + # dop_trainer = mlrun.import_function("function.yaml") - ctx = mlrun.get_or_create_ctx(name='test_dpo') + ctx = mlrun.get_or_create_ctx(name="test_dpo") train_dataset = "unalignment/toxic-dpo-v0.2" eval_dataset = "unalignment/toxic-dpo-v0.2" training_arguments = { - "evaluation_strategy": "steps", - "do_eval": False, - "optim": "paged_adamw_8bit", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 1, - "log_level": "info", - "save_steps": 5, - "learning_rate": 5e-7, - "eval_steps": 1, - "num_train_epochs": 1, - "max_steps": 5, - "warmup_steps": 5, - "fp16": True, - "lr_scheduler_type": "cosine", - "remove_unused_columns": True, - "gradient_checkpointing": True, - } + "evaluation_strategy": "steps", + "do_eval": False, + "optim": "paged_adamw_8bit", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 1, + "log_level": "info", + "save_steps": 5, + "learning_rate": 5e-7, + "eval_steps": 1, + "num_train_epochs": 1, + "max_steps": 5, + "warmup_steps": 5, + "fp16": True, + "lr_scheduler_type": "cosine", + "remove_unused_columns": True, + "gradient_checkpointing": True, + } dpo_train( - context = ctx, - train_dataset = train_dataset, - eval_dataset = eval_dataset, - peft_config=True, - model = model_name, - tokenizer = tokenizer, - training_config = training_arguments, - use_cuda = True, - beta = 0.1, + context=ctx, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + peft_config=True, + model=model_name, + tokenizer=tokenizer, + training_config=training_arguments, + use_cuda=True, + beta=0.1, ) - def test_dpo_train(): @@ -81,7 +81,6 @@ def test_dpo_train(): "training_config": training_arguments, "dataset_columns_to_train": "quote", "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, } From 3fe14517bead761f82e68ca7e8e07940a422c48e Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:13:27 +0000 Subject: [PATCH 27/33] update the function yaml file --- huggingface_dpo/function.yaml | 45 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/huggingface_dpo/function.yaml b/huggingface_dpo/function.yaml index d0baab33a..c3593fa63 100644 --- a/huggingface_dpo/function.yaml +++ b/huggingface_dpo/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: huggingface-dpo-trainer tag: '' - hash: 3db0dab27e7aaa2f91a96c2545060cc7e1a15676 + hash: 584b20584f58bfa89225b6999e6b55ad017dd87a project: '' labels: author: pgw @@ -14,7 +14,7 @@ spec: args: [] image: mlrun/mlrun build: - functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft_config"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face Trainer API.&#x000D;
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,&#x000D;
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# will be used if user provides "True" with config name as input
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}
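&#x000D;
# Note (illustrative): passing ``True`` for one of these configs (e.g. ``quantization_config=True``&#x000D;
# in ``dpo_train``) selects the corresponding default above inside ``_update_config``; a dict or&#x000D;
# config object passed by the user is kept and only updated from any matching ``<prefix>_*`` kwargs.&#x000D;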


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can, for example, add or modify values in the default configs.&#x000D;

    Goes over all configs and their corresponding prefixes, collects all the keys from the given dict that start&#x000D;
     with the prefix, and adds them to the appropriate config.&#x000D;

    :param src: dict of all candidate values with which to update the configs.&#x000D;
    :param dst: dict containing all configs to update.
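&#x000D;
    Example (illustrative): a kwarg such as ``training_learning_rate=5e-7`` matches the&#x000D;
     ``training`` prefix and is therefore stored in the training config as ``learning_rate=5e-7``.&#x000D;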
    """

    for config_name, config in dst.items():

        # If given True we use the default config dict&#x000D;
        # Can also be False or a config dict given by the user, so we check specifically for True&#x000D;
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "lora":
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training if using a number of GPUs&#x000D;

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")&#x000D;

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a tuple then we assume it contains both the name and the class&#x000D;
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified we choose the default tokenizer that corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a training dataset we load the "train" split&#x000D;
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them&#x000D;
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and&#x000D;
    returns them ready to use in training.&#x000D;

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given then we must check whether it contains both datasets or only one should be used&#x000D;
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Large Language Model (LLM) on a specific task using the provided dataset.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).
    :param beta: The beta hyperparameter scaling the implicit KL penalty in the DPO loss (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

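    # DPO optimizes the policy directly on preference pairs (chosen vs. rejected responses);
    # `beta` scales the implicit KL penalty that keeps the trained policy close to the
    # reference policy (derived internally from the base model, since `ref_model` is None).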
    trainer = trl.DPOTrainer(
        model=model,
        ref_model=None,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    # Save the trained model to a temporary directory (mkdtemp, so it is not removed under us):
    temp_directory = tempfile.mkdtemp()
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

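    # Perplexity is the exponential of the mean negative log-likelihood over the evaluated windows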
    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from trl import DPOTrainer
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from plotly import graph_objects as go
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    PreTrainedModel,
    PreTrainedTokenizer,
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    peft_config = "peft"
    data_collator = "data_collator"
    beta = "beta"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the HuggingFace `DPOTrainer` API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: DPOTrainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: DPOTrainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        if "/" in name:
            name = "_".join(name.split("/"))
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: DPOTrainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# used when the user passes "True" for the corresponding config name
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_CONFIG = peft.LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.

    Goes over all configs and their corresponding prefixes, collects every key in the given dict that starts
    with the prefix and adds it to the appropriate config.

    :param src: dict of all candidate values with which to update the configs.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If True is given, we use the default config
        # It can also be False or a config dict provided by the user, so we check specifically for True
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "peft":
            config = PEFT_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})

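# Illustrative example (an assumption about usage, not part of the original code):
# keyword arguments prefixed with a config name are routed into that config, e.g. passing
# `training_num_train_epochs=3` through `dpo_train(**kwargs)` adds {"num_train_epochs": 3}
# to the training config, while `quantization_config=True` swaps in the default
# QUANTIZATION_CONFIG defined above.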

def _get_class_object(class_path: str) -> type:
    """
    Given a full class name, this function returns the correct class.

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :returns: the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training when using multiple GPUs

    :returns: model and tokenizer
    """
    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the name and the class
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # if we don't get the model class, we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    device_map = device_map if use_cuda else None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # if not specified, choose the default tokenizer corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer

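# Illustrative usage sketch (an assumption, not part of the original code): `model` may be
# a plain name such as "mistralai/Mistral-7B-Instruct-v0.2", or a [name, class-path] pair,
# e.g. ["mistralai/Mistral-7B-Instruct-v0.2", "transformers.AutoModelForCausalLM"], in
# which case `_get_class_object` resolves the class used to load the pretrained weights.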

def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a train dataset, load it with the "train" split
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")
    return dataset

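# Illustrative behavior (hypothetical dataset name): `_dataset_loader("org/my-dpo-dataset",
# is_train=True)` loads the "train" split, while `is_train=False` falls back to a "test",
# "eval" or "validation" split when one exists.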

def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
) -> Tuple[Dataset, Union[Dataset, None]]:
    """
    Loads the train and eval datasets (if provided) and returns them ready for use in training.

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading

    :returns: loaded datasets
    """

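    # Note: the datasets are not tokenized here; TRL's DPOTrainer expects preference-style
    # records (typically "prompt", "chosen" and "rejected" columns) and tokenizes them internally.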
    # Load datasets
    # if two paths/names are provided, load each separately using the designated loader
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )
    # if only one path is given, check whether it contains both datasets or only the train set
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only a train dataset was given, return it without an eval dataset
                return train_dataset, None
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    return train_dataset, eval_dataset


def dpo_train(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    peft_config: Union[dict, bool] = False,
    beta: Union[float, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Runs a DPO (Direct Preference Optimization) training job to align an LLM.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param peft_config: Configuration options for PEFT / Low-Rank Adaptation (LoRA) (optional).
    :param beta: The beta hyperparameter scaling the implicit KL penalty in the DPO loss (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param use_cuda: use gpu or not
    :param framework: pt or tf
    :param kwargs: Additional keyword arguments.
    """

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
        ConfigKeys.peft_config: peft_config,
        ConfigKeys.beta: beta,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        framework=framework,
        task=task,
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )
    train_dataset, eval_dataset = _prepare_dataset(
        train_dataset, eval_dataset, train_load_dataset_kwargs, eval_load_dataset_kwargs
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

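    # `ref_model=None` lets TRL derive the frozen reference policy from the base model
    # (the PEFT adapters are disabled for the reference pass); `beta` scales the implicit
    # KL penalty of the DPO loss, and max_length / max_prompt_length bound the tokenized
    # prompt and response lengths.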
    trainer = DPOTrainer(
        model=model,
        ref_model=None,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=configs[ConfigKeys.peft_config],
        beta=configs[ConfigKeys.beta],
        tokenizer=tokenizer,
        args=training_args,
        max_length=2048,
        max_prompt_length=4096,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    # Save the trained model to a temporary directory (mkdtemp, so it is not removed under us):
    temp_directory = tempfile.mkdtemp()
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

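    # Perplexity is the exponential of the mean negative log-likelihood over the evaluated windows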
    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 commands: [] code_origin: '' origin_filename: '' @@ -34,7 +34,7 @@ spec: default: null outputs: - default: '' - lineno: 72 + lineno: 79 mlrun_train: name: mlrun_train doc: '' @@ -43,7 +43,7 @@ spec: default: '' outputs: - default: '' - lineno: 82 + lineno: 89 wrapper: name: wrapper doc: '' @@ -53,7 +53,7 @@ spec: default: '' outputs: - default: '' - lineno: 83 + lineno: 90 on_epoch_begin: name: on_epoch_begin doc: '' @@ -71,7 +71,7 @@ spec: default: '' outputs: - default: '' - lineno: 131 + lineno: 138 on_epoch_end: name: on_epoch_end doc: '' @@ -89,7 +89,7 @@ spec: default: '' outputs: - default: '' - lineno: 142 + lineno: 149 on_log: name: on_log doc: '' @@ -110,7 +110,7 @@ spec: default: null outputs: - default: '' - lineno: 153 + lineno: 160 on_train_begin: name: on_train_begin doc: '' @@ -128,7 +128,7 @@ spec: default: '' outputs: - default: '' - lineno: 179 + lineno: 186 on_train_end: name: on_train_end doc: '' @@ -152,7 +152,7 @@ spec: default: null outputs: - default: '' - lineno: 190 + lineno: 197 on_evaluate: name: on_evaluate doc: '' @@ -170,7 +170,7 @@ spec: default: '' outputs: - default: '' - lineno: 203 + lineno: 210 log_metrics: name: log_metrics doc: '' @@ -179,7 +179,7 @@ spec: default: '' outputs: - default: '' - lineno: 217 + lineno: 224 log_metric_plot: name: log_metric_plot doc: '' @@ -194,7 +194,7 @@ spec: default: '' outputs: - default: '' - lineno: 224 + lineno: 231 apply_mlrun: name: apply_mlrun doc: This is temporary and will be built in mlrun 1.5.0 @@ -222,13 +222,12 @@ spec: default: null outputs: - default: '' - lineno: 246 + lineno: 255 dpo_train: name: dpo_train - doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ - \ dataset.\n The function takes various configuration parameters to customize\ - \ the training process\n and adapt the model to specific tasks using a provided\ - \ dataset." + doc: "Form a dpo training job to do llm alignment\n The function takes various\ + \ configuration parameters to customize the training process\n and adapt the\ + \ model to specific tasks using a provided dataset." parameters: - name: context type: MLClientCtx @@ -250,10 +249,6 @@ spec: type: dict doc: kwargs for dataset loading default: {} - - name: dataset_columns_to_train - type: Union[str, list] - doc: which columns to pass to the model as inputs - default: text - name: model type: Union[str, List[str]] doc: a tuple containing model name and class, or str with model name or path @@ -273,9 +268,11 @@ spec: default: false - name: peft_config type: Union[dict, bool] + doc: Configuration options for Low-Rank Approximation (LoRA) (optional). 
default: false - name: beta type: Union[float, bool] + doc: super parameter of KL divergence default: false - name: training_config type: dict @@ -310,7 +307,7 @@ spec: default: auto outputs: - default: '' - lineno: 627 + lineno: 583 evaluate: name: evaluate doc: 'Evaluating the model using perplexity, for more information visit: @@ -337,7 +334,7 @@ spec: default: null outputs: - default: '' - lineno: 785 + lineno: 726 description: doing the alignment with dpo trainer default_handler: dpo_train disable_auto_mount: false From bbc2fa2001f7b7eb39a845526fc882eb985c6e88 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:13:45 +0000 Subject: [PATCH 28/33] update the test case --- huggingface_dpo/test_huggingface_dpo_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index f073aafb5..63f3f50c8 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -31,15 +31,15 @@ def test_dpo_fn(): "do_eval": False, "optim": "paged_adamw_8bit", "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 1, "log_level": "info", - "save_steps": 5, + "save_steps": 1, "learning_rate": 5e-7, "eval_steps": 1, "num_train_epochs": 1, - "max_steps": 5, - "warmup_steps": 5, + "max_steps": 1, + "warmup_steps": 1, "fp16": True, "lr_scheduler_type": "cosine", "remove_unused_columns": True, From c781ecf4f1aec77f2d9a692330975ba3171b0f8e Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 05:17:14 +0000 Subject: [PATCH 29/33] passed the test case --- .../test_huggingface_dpo_trainer.py | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 63f3f50c8..1aa31707e 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -21,7 +21,6 @@ def test_dpo_fn(): model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name - # dop_trainer = mlrun.import_function("function.yaml") ctx = mlrun.get_or_create_ctx(name="test_dpo") train_dataset = "unalignment/toxic-dpo-v0.2" @@ -56,43 +55,3 @@ def test_dpo_fn(): use_cuda=True, beta=0.1, ) - - -def test_dpo_train(): - - model_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer = model_name - dop_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "ref_model": None, - "tokenizer": tokenizer, - "train_dataset": "Abirate/english_quotes", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - dpo_trainer.run( - local=True, - params=params, - handler="dpo_train", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") From 2f5361e72827d84c70dd240a5857b6b9e8459785 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:26:38 +0000 Subject: [PATCH 30/33] adding the function yaml to 
the test case --- .../test_huggingface_dpo_trainer.py | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 1aa31707e..98783c644 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -13,12 +13,13 @@ # limitations under the License. import tempfile -from huggingface_dpo_trainer import dpo_train +# from huggingface_dpo_trainer import dpo_train import mlrun def test_dpo_fn(): + dpo_trainer = mlrun.import_function("function.yaml") model_name = "mistralai/Mistral-7B-Instruct-v0.2" tokenizer = model_name @@ -44,14 +45,24 @@ def test_dpo_fn(): "remove_unused_columns": True, "gradient_checkpointing": True, } - dpo_train( - context=ctx, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - peft_config=True, - model=model_name, - tokenizer=tokenizer, - training_config=training_arguments, - use_cuda=True, - beta=0.1, - ) + params = { + "model": model_name, + "tokenizer": tokenizer, + "train_dataset": train_dataset, + "eval_dataset": eval_dataset, + "peft_config": True, + "training_config": training_arguments, + "use_cuda": True, + "beta": 0.1, + } + try: + with tempfile.TemporaryDirectory() as test_directory: + dpo_trainer.run( + local=True, + params=params, + handler="dpo_train", + returns=["model"], + workdir=test_directory, + ) + except Exception as exception: + print(f"-The training failed -raised the following error: \n -{exception}") From d63b755d4e84f58184529cb8a81f9344d07278c5 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:30:47 +0000 Subject: [PATCH 31/33] should be good for the notebook --- huggingface_dpo/test_huggingface_dpo_trainer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/huggingface_dpo/test_huggingface_dpo_trainer.py b/huggingface_dpo/test_huggingface_dpo_trainer.py index 98783c644..db289b51e 100644 --- a/huggingface_dpo/test_huggingface_dpo_trainer.py +++ b/huggingface_dpo/test_huggingface_dpo_trainer.py @@ -13,8 +13,6 @@ # limitations under the License. import tempfile - -# from huggingface_dpo_trainer import dpo_train import mlrun From 5d1ccc444d89cd041c4ffe2d06da384f5bdf1507 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:54:43 +0000 Subject: [PATCH 32/33] adding the notebook and raise the PR --- huggingface_dpo/huggingface_dpo_trainer.ipynb | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 huggingface_dpo/huggingface_dpo_trainer.ipynb diff --git a/huggingface_dpo/huggingface_dpo_trainer.ipynb b/huggingface_dpo/huggingface_dpo_trainer.ipynb new file mode 100644 index 000000000..b0b0f60ae --- /dev/null +++ b/huggingface_dpo/huggingface_dpo_trainer.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a2c5dc6d-33d0-4e74-a875-6eab556e3b2d", + "metadata": {}, + "source": [ + "# DPO trainer for llm alignment" + ] + }, + { + "cell_type": "markdown", + "id": "cc7aa261-17b2-4362-bf6a-34af79b0230b", + "metadata": {}, + "source": [ + "## Notebook Introduction: Doing the llm alignment with DPO trainer\n", + "\n", + "In this notebook, we will walk you through a step-by-step process of how to do alignment for a SOTA llm with DPO method. You don't need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness." 
+ ] + }, + { + "cell_type": "markdown", + "id": "425249e9-f43f-45e6-aa25-9f53099049cd", + "metadata": {}, + "source": [ + "### First, we will select the model we wish to align and take the matching tokenizer and appropriate config" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3410e9c2-0557-4961-995e-0ef0cc07bf82", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig\n", + "from transformers import logging\n", + "\n", + "logging.set_verbosity(\"CRITICAL\")\n", + "\n", + "model_name = \"mistralai/Mistral-7B-Instruct-v0.2\"\n", + "tokenizer = model_name\n", + "generation_config = GenerationConfig.from_pretrained(model_name)" + ] + }, + { + "cell_type": "markdown", + "id": "f33f3c35-cf61-4b0f-8da9-1c30d3b53230", + "metadata": {}, + "source": [ + "### Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a8ee7c35-adf7-4ed8-9e7e-e659b9461cd5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:49:17,440 [info] Project loaded successfully: {'project_name': 'dpo-trainer-test'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "\n", + "project = mlrun.get_or_create_project(\n", + " name=\"dpo-trainer-test\",\n", + " context=\"./\",\n", + " user_project=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d56b834f-adf6-4736-8de7-3348e050f561", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.set_function(\n", + " \"huggingface_dpo_trainer.py\",\n", + " name=\"dpo-trainer\",\n", + " kind=\"local\",\n", + " handler=\"dpo_train\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "f42315db-6ddd-4dc1-89f3-c732f92d0d47", + "metadata": {}, + "source": [ + "### we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the function" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8e62e577-15fb-477d-9c56-fa9fb4c2669b", + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = \"reciprocate/ultrafeedback_cleaned_high_dpo\"\n", + "eval_dataset = \"reciprocate/ultrafeedback_cleaned_high_dpo\"\n", + "training_arguments = {\n", + " \"evaluation_strategy\": \"steps\",\n", + " \"do_eval\": True,\n", + " \"optim\": \"paged_adamw_8bit\",\n", + " \"per_device_train_batch_size\": 1,\n", + " \"gradient_accumulation_steps\": 1,\n", + " \"per_device_eval_batch_size\": 1,\n", + " \"log_level\": \"info\",\n", + " \"save_steps\": 1,\n", + " \"learning_rate\": 5e-7,\n", + " \"eval_steps\": 1,\n", + " \"num_train_epochs\": 1,\n", + " \"max_steps\": 1,\n", + " \"warmup_steps\": 1,\n", + " \"fp16\": True,\n", + " \"lr_scheduler_type\": \"cosine\",\n", + " \"remove_unused_columns\": True,\n", + " \"gradient_checkpointing\": True,\n", + "}\n", + "params = {\n", + " \"model\": model_name,\n", + " \"tokenizer\": tokenizer,\n", + " \"train_dataset\": train_dataset,\n", + " \"eval_dataset\": eval_dataset,\n", + " \"peft_config\": True,\n", + " \"training_config\": training_arguments,\n", + " \"use_cuda\": True,\n", + " \"beta\": 0.1,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "284a5772-f88d-46c9-87bc-fc14e434c1b4", + "metadata": {}, + 
"source": [ + "### Now we simply run the function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11ab5888-5870-4bf8-9657-db930adecd77", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:49:20,738 [info] Storing function: {'name': 'dpo-trainer', 'uid': 'b4ed0d2bdc8c4e44892aee1a3549969d', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3a28ff59fc674c4aac2e2ee2d1bf0211", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00 2024-04-01 16:49:40,542 [info] training 'mistralai/Mistral-7B-Instruct-v0.2'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** Running training *****\n", + " Num examples = 541\n", + " Num Epochs = 1\n", + " Instantaneous batch size per device = 1\n", + " Total train batch size (w. parallel, distributed & accumulation) = 1\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 1\n", + " Number of trainable parameters = 41,943,040\n", + "torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n", + "None of the inputs have requires_grad=True. Gradients will be None\n", + "Could not estimate the number of tokens of the input, floating-point operations will not be computed\n", + "***** Running Evaluation *****\n", + " Num examples = 541\n", + " Batch size = 1\n" + ] + } + ], + "source": [ + "training_run = mlrun.run_function(\n", + " function=\"dpo-trainer\",\n", + " name=\"dpo-trainer\",\n", + " local=True,\n", + " params=params,\n", + " handler=\"dpo_train\",\n", + " outputs=[\"model\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e674d25-5f1f-4ea8-af02-7d22c2fb6760", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4dfe9b-407a-43c0-9c5e-56de106477ac", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dpo", + "language": "python", + "name": "conda-env-.conda-dpo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From bf66dfd8944d3532431a3e09687e4310c72bd3f0 Mon Sep 17 00:00:00 2001 From: peng wei Date: Mon, 1 Apr 2024 16:57:34 +0000 Subject: [PATCH 33/33] raise the PR --- huggingface_dpo/huggingface_dpo_trainer.ipynb | 322 +++++++++++++++++- 1 file changed, 320 insertions(+), 2 deletions(-) diff --git a/huggingface_dpo/huggingface_dpo_trainer.ipynb b/huggingface_dpo/huggingface_dpo_trainer.ipynb index b0b0f60ae..07dfcf024 100644 --- a/huggingface_dpo/huggingface_dpo_trainer.ipynb +++ b/huggingface_dpo/huggingface_dpo_trainer.ipynb @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "11ab5888-5870-4bf8-9657-db930adecd77", "metadata": {}, "outputs": [ @@ -229,7 +229,325 @@ "Could not estimate the number of tokens of the input, 
floating-point operations will not be computed\n", "***** Running Evaluation *****\n", " Num examples = 541\n", - " Batch size = 1\n" + " Batch size = 1\n", + "Saving model checkpoint to /tmp/tmp1k687jql/tmp-checkpoint-1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'eval_train_loss': 0.6931472420692444, 'eval_train_runtime': 365.1876, 'eval_train_samples_per_second': 1.481, 'eval_train_steps_per_second': 1.481, 'eval_rewards/chosen': 0.0, 'eval_rewards/rejected': 0.0, 'eval_rewards/accuracies': 0.0, 'eval_rewards/margins': 0.0, 'eval_logps/rejected': -127.08296203613281, 'eval_logps/chosen': -328.57867431640625, 'eval_logits/rejected': -2.3305602073669434, 'eval_logits/chosen': -2.911039113998413, 'epoch': 0.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /igz/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/config.json\n", + "Model config MistralConfig {\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.38.2\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "tokenizer config file saved in /tmp/tmp1k687jql/tmp-checkpoint-1/tokenizer_config.json\n", + "Special tokens file saved in /tmp/tmp1k687jql/tmp-checkpoint-1/special_tokens_map.json\n", + "\n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Saving model checkpoint to /tmp/tmpe5yijcu0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'train_runtime': 367.9669, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'train_loss': 0.6931471824645996, 'epoch': 0.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /igz/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/config.json\n", + "Model config MistralConfig {\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.38.2\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "tokenizer config file saved in /tmp/tmpe5yijcu0/tokenizer_config.json\n", + "Special tokens file saved in /tmp/tmpe5yijcu0/special_tokens_map.json\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
dpo-trainer-test-pengwei0Apr 01 16:49:20completeddpo-trainer
v3io_user=pengwei
kind=local
owner=pengwei
host=jupyter-pengwei-gpu-86c58c8f79-8ls8j
model=mistralai/Mistral-7B-Instruct-v0.2
tokenizer=mistralai/Mistral-7B-Instruct-v0.2
train_dataset=unalignment/toxic-dpo-v0.2
eval_dataset=unalignment/toxic-dpo-v0.2
peft_config=True
training_config={'evaluation_strategy': 'steps', 'do_eval': False, 'optim': 'paged_adamw_8bit', 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 1, 'per_device_eval_batch_size': 1, 'log_level': 'info', 'save_steps': 1, 'learning_rate': 5e-07, 'eval_steps': 1, 'num_train_epochs': 1, 'max_steps': 1, 'warmup_steps': 1, 'fp16': True, 'lr_scheduler_type': 'cosine', 'remove_unused_columns': True, 'gradient_checkpointing': True}
use_cuda=True
beta=0.1
eval_train_loss=0.6931472420692444
eval_train_runtime=365.1876
eval_train_samples_per_second=1.481
eval_train_steps_per_second=1.481
eval_rewards/chosen=0.0
eval_rewards/rejected=0.0
eval_rewards/accuracies=0.0
eval_rewards/margins=0.0
eval_logps/rejected=-127.08296203613281
eval_logps/chosen=-328.57867431640625
eval_logits/rejected=-2.3305602073669434
eval_logits/chosen=-2.911039113998413
train_runtime=367.9669
train_samples_per_second=0.003
train_steps_per_second=0.003
total_flos=0.0
train_loss=0.6931471824645996
model
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-04-01 16:55:57,867 [info] Run execution finished: {'status': 'completed', 'name': 'dpo-trainer'}\n" ] } ],