From 70dfa5d0c4f43beec508cfd24fb1351f3c4e2727 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Thu, 13 Jul 2023 18:19:23 -0500 Subject: [PATCH 01/16] :recycle: Refactor trainer logic and move it to resources Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 67 ++++++++----------- caikit_nlp/resources/pretrained_model/base.py | 37 +++++++++- .../pretrained_model/hf_auto_seq2seq_lm.py | 44 +++++++++++- 3 files changed, 105 insertions(+), 43 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 05b1c075..ca726fa0 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -15,14 +15,7 @@ # Third Party from torch.utils.data import IterableDataset -from transformers import ( - AutoConfig, - AutoTokenizer, - DataCollatorForSeq2Seq, - Seq2SeqTrainer, - Seq2SeqTrainingArguments, - Trainer, -) +from transformers import AutoConfig, AutoTokenizer, Trainer # First Party from caikit.core.data_model import DataStream @@ -32,6 +25,7 @@ # Local from ...data_model import GeneratedResult, GenerationTrainRecord +from ...resources.pretrained_model.base import PretrainedModelBase from ...toolkit.data_stream_wrapper import SimpleIterableStreamWrapper from ...toolkit.data_type_utils import get_torch_dtype from .text_generation_task import TextGenerationTask @@ -79,6 +73,7 @@ def train( lr: float = 2e-5, # Directory where model predictions and checkpoints will be written checkpoint_dir: str = "/tmp", + **training_arguments ): """ # FIXME: Below is currently configured for Seq2Seq only @@ -110,6 +105,7 @@ def train( log.debug("Bootstrapping base resource [%s]", base_model) base_model = resource_type.bootstrap(base_model, torch_dtype=torch_dtype) + error.type_check("", PretrainedModelBase, base_model=base_model) ## Generate data loader from stream training_dataset: IterableDataset = cls._preprocess_function( train_stream=train_stream, @@ -125,40 +121,33 @@ def train( # by optionally accepting `training_args` # as argument to this train function. 
# TODO: Remove all the default used below and make them all configurable - training_args = Seq2SeqTrainingArguments( - output_dir=checkpoint_dir, - per_device_train_batch_size=batch_size, - per_device_eval_batch_size=batch_size, - num_train_epochs=num_epochs, + + training_args = { + "output_dir": checkpoint_dir, + "per_device_train_batch_size": batch_size, + "per_device_eval_batch_size": batch_size, + "num_train_epochs": num_epochs, # NOTE: We have disabled evaluation for now - do_eval=False, - # evaluation_strategy = "epoch", - learning_rate=lr, - weight_decay=0.01, - save_total_limit=3, - predict_with_generate=True, - fp16=True, - push_to_hub=False, - no_cuda=False, # Default - generation_max_length=max_target_length, - remove_unused_columns=False, - dataloader_pin_memory=False, - gradient_accumulation_steps=accumulate_steps, - eval_accumulation_steps=accumulate_steps, + "do_eval": False, + "# evaluation_strategy ": "epoch", + "learning_rate": lr, + "weight_decay": 0.01, + "save_total_limit": 3, + "predict_with_generate": True, + "fp16": True, + "push_to_hub": False, + "no_cuda": False, # Default + "generation_max_length": max_target_length, + "remove_unused_columns": False, + "dataloader_pin_memory": False, + "gradient_accumulation_steps": accumulate_steps, + "eval_accumulation_steps": accumulate_steps, # eval_steps=1, - ) + **training_arguments, + } - data_collator = DataCollatorForSeq2Seq( - tokenizer=base_model.tokenizer, model=base_model.model - ) - - trainer = Seq2SeqTrainer( - base_model.model, - training_args, - train_dataset=training_dataset, - data_collator=data_collator, - tokenizer=base_model.tokenizer, - # compute_metrics=compute_metrics, + trainer = base_model.get_trainer( + train_dataset=training_dataset, **training_args ) # Start training via Trainer.train function diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py index 079a6f83..3c9e26a7 100644 --- a/caikit_nlp/resources/pretrained_model/base.py +++ b/caikit_nlp/resources/pretrained_model/base.py @@ -14,12 +14,13 @@ # Standard from abc import ABC, abstractmethod -from typing import List, Optional, Type +from typing import List, Optional, Type, Union import json import os # Third Party -from transformers import AutoTokenizer +from torch.utils.data import IterableDataset +from transformers import AutoTokenizer, DataCollator, Trainer, TrainingArguments from transformers.models.auto.auto_factory import _BaseAutoModelClass import torch @@ -233,6 +234,38 @@ def save( self.tokenizer.save_pretrained(tok_abs_path) self.model.save_pretrained(model_abs_path) + def get_trainer( + self, + train_dataset: IterableDataset, + eval_dataset: Union[IterableDataset, None] = None, + optimizers=(None, None), + **kwargs, + ): + """ + NOTE: following parameters are not supported currently: + 1. model_init + 2. compute_metrics + 3. callbacks + 4. preprocess_logits_for_metrics + """ + + training_args = TrainingArguments(**kwargs) + + # TODO: Fetch DataCollator either from property of this + # class or fetch it as an argument. 
+ data_collator = DataCollator(tokenizer=self._tokenizer, model=self._model) + + # pylint: disable=duplicate-code + trainer_arguments = { + "train_dataset": train_dataset, + "data_collator": data_collator, + "tokenizer": self._tokenizer, + "optimizers": optimizers, + "eval_dataset": eval_dataset, + } + + return Trainer(self._model, training_args, **trainer_arguments) + # pylint: disable=unused-argument @classmethod def get_num_transformers_submodules( diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index d0627708..1a06cd9b 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -15,10 +15,16 @@ Huggingface auto causal LM resource type """ # Standard -from typing import List +from typing import List, Union # Third Party -from transformers import AutoModelForSeq2SeqLM +from torch.utils.data import IterableDataset +from transformers import ( + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, +) from transformers.models.auto import modeling_auto # First Party @@ -64,3 +70,37 @@ def get_num_transformers_submodules( "", 0 < num_transformer_submodules <= cls.MAX_NUM_TRANSFORMERS ) return num_transformer_submodules + + def get_trainer( + self, + train_dataset: IterableDataset, + eval_dataset: Union[IterableDataset, None] = None, + optimizers=(None, None), + **kwargs + ): + """ + NOTE: following parameters are not supported currently: + 1. model_init + 2. compute_metrics + 3. callbacks + 4. preprocess_logits_for_metrics + """ + + training_args = Seq2SeqTrainingArguments(**kwargs) + + # TODO: Fetch DataCollator either from property of this + # class or fetch it as an argument. 
+ data_collator = DataCollatorForSeq2Seq( + tokenizer=self._tokenizer, model=self._model + ) + + # pylint: disable=duplicate-code + trainer_arguments = { + "train_dataset": train_dataset, + "data_collator": data_collator, + "tokenizer": self._tokenizer, + "optimizers": optimizers, + "eval_dataset": eval_dataset, + } + + return Seq2SeqTrainer(self._model, training_args, **trainer_arguments) From e9d21ffda5700e095e4db83b47f755e28a1e4b70 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Sun, 16 Jul 2023 16:52:10 -0500 Subject: [PATCH 02/16] :construction: Work in progress causal-lm trainer Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 4 +- caikit_nlp/resources/pretrained_model/base.py | 32 +++++++++++--- .../pretrained_model/hf_auto_seq2seq_lm.py | 4 +- .../text_generation/test_fine_tuning.py | 43 +++++++++++++++++-- 4 files changed, 70 insertions(+), 13 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index ca726fa0..18951bfa 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -129,15 +129,13 @@ def train( "num_train_epochs": num_epochs, # NOTE: We have disabled evaluation for now "do_eval": False, - "# evaluation_strategy ": "epoch", + # "evaluation_strategy ": "epoch", "learning_rate": lr, "weight_decay": 0.01, "save_total_limit": 3, - "predict_with_generate": True, "fp16": True, "push_to_hub": False, "no_cuda": False, # Default - "generation_max_length": max_target_length, "remove_unused_columns": False, "dataloader_pin_memory": False, "gradient_accumulation_steps": accumulate_steps, diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py index 3c9e26a7..f7e231f5 100644 --- a/caikit_nlp/resources/pretrained_model/base.py +++ b/caikit_nlp/resources/pretrained_model/base.py @@ -20,7 +20,7 @@ # Third Party from torch.utils.data import IterableDataset -from transformers import AutoTokenizer, DataCollator, Trainer, TrainingArguments +from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments from transformers.models.auto.auto_factory import _BaseAutoModelClass import torch @@ -251,11 +251,8 @@ def get_trainer( training_args = TrainingArguments(**kwargs) - # TODO: Fetch DataCollator either from property of this - # class or fetch it as an argument. - data_collator = DataCollator(tokenizer=self._tokenizer, model=self._model) + data_collator = self._get_data_collator(**kwargs) - # pylint: disable=duplicate-code trainer_arguments = { "train_dataset": train_dataset, "data_collator": data_collator, @@ -266,6 +263,31 @@ def get_trainer( return Trainer(self._model, training_args, **trainer_arguments) + + def _get_data_collator(self, **kwargs): + """Function to return appropriate data collator based on resource. + + The default implementation of the base resource uses + DataCollatorWithPadding which will dynamically pad the inputs received. + + Args: + **kwargs: + All the keyword arguments passed to this function + will get filtered out to appropriate ones that are + applicable to implemented data collator. 
+ Returns: + transformers.DataCollator + """ + + applicable_args = ["max_length", "pad_to_multiple_of"] + collator_kwargs = {key: kwargs[key] for key in applicable_args if key in kwargs} + + return DataCollatorWithPadding( + tokenizer=self._tokenizer, + padding=True, + **collator_kwargs + ) + # pylint: disable=unused-argument @classmethod def get_num_transformers_submodules( diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index 1a06cd9b..bf61dfcf 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -94,13 +94,15 @@ def get_trainer( tokenizer=self._tokenizer, model=self._model ) - # pylint: disable=duplicate-code trainer_arguments = { "train_dataset": train_dataset, "data_collator": data_collator, "tokenizer": self._tokenizer, "optimizers": optimizers, "eval_dataset": eval_dataset, + # Following only applicable for seq2seq + "predict_with_generate": True, + # "generation_max_length": max_target_length, } return Seq2SeqTrainer(self._model, training_args, **trainer_arguments) diff --git a/tests/modules/text_generation/test_fine_tuning.py b/tests/modules/text_generation/test_fine_tuning.py index e2491851..f82c79c3 100644 --- a/tests/modules/text_generation/test_fine_tuning.py +++ b/tests/modules/text_generation/test_fine_tuning.py @@ -9,8 +9,8 @@ # Local from caikit_nlp.data_model import GeneratedResult, GenerationTrainRecord from caikit_nlp.modules.text_generation import FineTuning -from caikit_nlp.resources.pretrained_model import HFAutoSeq2SeqLM -from tests.fixtures import SEQ2SEQ_LM_MODEL, disable_wip +from caikit_nlp.resources.pretrained_model import HFAutoCausalLM, HFAutoSeq2SeqLM +from tests.fixtures import CAUSAL_LM_MODEL, SEQ2SEQ_LM_MODEL, disable_wip @pytest.mark.skip( @@ -20,8 +20,9 @@ # run function """ ) -def test_train_model(disable_wip): - """Ensure that we can train a model on some toy data for 1+ steps & run inference.""" +def test_train_model_seq2seq(disable_wip): + """Ensure that we can finetune a seq2seq model on some toy data for 1+ + steps & run inference.""" train_kwargs = { "base_model": HFAutoSeq2SeqLM.bootstrap( model_name=SEQ2SEQ_LM_MODEL, tokenizer_name=SEQ2SEQ_LM_MODEL @@ -44,3 +45,37 @@ def test_train_model(disable_wip): # Ensure that we can get something out of it pred = model.run("@bar what a cute cat!") assert isinstance(pred, GeneratedResult) + + +# @pytest.mark.skip( +# """ +# We are skipping this test because we are waiting for new release +# of transformers library that includes bugfix that is currently breaking +# # run function +# """ +# ) +def test_train_model_causallm(disable_wip): + """Ensure that we can finetune a causal-lm model on some toy data for 1+ + steps & run inference.""" + train_kwargs = { + "base_model": HFAutoCausalLM.bootstrap( + model_name=CAUSAL_LM_MODEL, tokenizer_name=CAUSAL_LM_MODEL + ), + "num_epochs": 1, + "train_stream": caikit.core.data_model.DataStream.from_iterable( + [ + GenerationTrainRecord( + input="@foo what a cute dog!", output="no complaint" + ), + GenerationTrainRecord( + input="@bar this is the worst idea ever.", output="complaint" + ), + ] + ), + "torch_dtype": torch.float32, + } + model = FineTuning.train(**train_kwargs) + assert isinstance(model.model, Trainer) + # Ensure that we can get something out of it + pred = model.run("@bar what a cute cat!") + assert isinstance(pred, GeneratedResult) \ No newline at end of file From 
e067c6dd9ce06ea405adb1fd0052ab6c4536eb55 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Tue, 18 Jul 2023 15:13:27 -0500 Subject: [PATCH 03/16] :construction: Implement seq2seq collator in resources Signed-off-by: gkumbhat --- .../pretrained_model/hf_auto_seq2seq_lm.py | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index bf61dfcf..4ced36d9 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -90,9 +90,7 @@ def get_trainer( # TODO: Fetch DataCollator either from property of this # class or fetch it as an argument. - data_collator = DataCollatorForSeq2Seq( - tokenizer=self._tokenizer, model=self._model - ) + data_collator = self._get_data_collator(**kwargs) trainer_arguments = { "train_dataset": train_dataset, @@ -106,3 +104,27 @@ def get_trainer( } return Seq2SeqTrainer(self._model, training_args, **trainer_arguments) + + + def _get_data_collator(self, **kwargs): + """Function to return appropriate data collator based on resource. + + This implementation uses DataCollatorForSeq2Seq + + Args: + **kwargs: + All the keyword arguments passed to this function + will get filtered out to appropriate ones that are + applicable to implemented data collator. + Returns: + transformers.DataCollator + """ + + applicable_args = ["max_length", "pad_to_multiple_of"] + collator_kwargs = {key: kwargs[key] for key in applicable_args if key in kwargs} + + return DataCollatorForSeq2Seq( + tokenizer=self._tokenizer, + model=self._model, + **collator_kwargs + ) \ No newline at end of file From f539380121f6eac5c6c228eb41beb7beb6cc86e1 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Sat, 29 Jul 2023 12:17:49 -0500 Subject: [PATCH 04/16] :art: Fix linting and formatting Signed-off-by: gkumbhat --- caikit_nlp/modules/text_generation/fine_tuning.py | 7 ++----- caikit_nlp/resources/pretrained_model/base.py | 12 +++++++----- .../resources/pretrained_model/hf_auto_seq2seq_lm.py | 7 +++---- tests/modules/text_generation/test_fine_tuning.py | 3 ++- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 91eef1ef..16cfda15 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -18,9 +18,6 @@ from transformers import ( AutoConfig, AutoTokenizer, - DataCollatorForSeq2Seq, - Seq2SeqTrainer, - Seq2SeqTrainingArguments, Trainer, ) import torch @@ -34,7 +31,7 @@ import alog # Local -from ...data_model import GeneratedTextResult, GenerationTrainRecord +from ...data_model import GenerationTrainRecord from ...resources.pretrained_model.base import PretrainedModelBase from ...toolkit.data_stream_wrapper import SimpleIterableStreamWrapper from ...toolkit.data_type_utils import get_torch_dtype @@ -82,7 +79,7 @@ def train( lr: float = 2e-5, # Directory where model predictions and checkpoints will be written checkpoint_dir: str = "/tmp", - **training_arguments + **training_arguments, ): """ # FIXME: Below is currently configured for Seq2Seq only diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py index 13bc451b..6e9ea641 100644 --- a/caikit_nlp/resources/pretrained_model/base.py +++ b/caikit_nlp/resources/pretrained_model/base.py @@ -20,7 +20,12 @@ # Third Party 
from torch.utils.data import IterableDataset -from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments +from transformers import ( + AutoTokenizer, + DataCollatorWithPadding, + Trainer, + TrainingArguments, +) from transformers.models.auto.auto_factory import _BaseAutoModelClass import torch @@ -263,7 +268,6 @@ def get_trainer( return Trainer(self._model, training_args, **trainer_arguments) - def _get_data_collator(self, **kwargs): """Function to return appropriate data collator based on resource. @@ -283,9 +287,7 @@ def _get_data_collator(self, **kwargs): collator_kwargs = {key: kwargs[key] for key in applicable_args if key in kwargs} return DataCollatorWithPadding( - tokenizer=self._tokenizer, - padding=True, - **collator_kwargs + tokenizer=self._tokenizer, padding=True, **collator_kwargs ) # pylint: disable=unused-argument diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index aed29bb1..df43ea2d 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -91,6 +91,7 @@ def get_trainer( training_args = Seq2SeqTrainingArguments(**kwargs) + # pylint: disable=duplicate-code # TODO: Fetch DataCollator either from property of this # class or fetch it as an argument. data_collator = self._get_data_collator(**kwargs) @@ -108,7 +109,6 @@ def get_trainer( return Seq2SeqTrainer(self._model, training_args, **trainer_arguments) - def _get_data_collator(self, **kwargs): """Function to return appropriate data collator based on resource. @@ -127,10 +127,9 @@ def _get_data_collator(self, **kwargs): collator_kwargs = {key: kwargs[key] for key in applicable_args if key in kwargs} return DataCollatorForSeq2Seq( - tokenizer=self._tokenizer, - model=self._model, - **collator_kwargs + tokenizer=self._tokenizer, model=self._model, **collator_kwargs ) + @staticmethod def build_task_tokenize_function( tokenizer: "AutoTokenizer", diff --git a/tests/modules/text_generation/test_fine_tuning.py b/tests/modules/text_generation/test_fine_tuning.py index 9e54206d..64974425 100644 --- a/tests/modules/text_generation/test_fine_tuning.py +++ b/tests/modules/text_generation/test_fine_tuning.py @@ -64,6 +64,7 @@ def test_train_model_causallm(disable_wip): pred = model.run("@bar what a cute cat!") assert isinstance(pred, GeneratedTextResult) + ############################## Error Cases ################################ @@ -84,4 +85,4 @@ def test_zero_epoch_case(disable_wip): "torch_dtype": torch.float32, } model = FineTuning.train(**train_kwargs) - assert isinstance(model.model, Trainer) \ No newline at end of file + assert isinstance(model.model, Trainer) From e1c8f38ca0f56d2fd2176c791eec48ad39eabd76 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Sat, 29 Jul 2023 12:26:23 -0500 Subject: [PATCH 05/16] :bug: Fix seq2seq training arguments Signed-off-by: gkumbhat --- .../resources/pretrained_model/hf_auto_seq2seq_lm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index df43ea2d..cd6843ff 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -89,7 +89,10 @@ def get_trainer( 4. 
preprocess_logits_for_metrics """ - training_args = Seq2SeqTrainingArguments(**kwargs) + training_args = Seq2SeqTrainingArguments( + predict_with_generate=True, + **kwargs + ) # pylint: disable=duplicate-code # TODO: Fetch DataCollator either from property of this @@ -102,8 +105,6 @@ def get_trainer( "tokenizer": self._tokenizer, "optimizers": optimizers, "eval_dataset": eval_dataset, - # Following only applicable for seq2seq - "predict_with_generate": True, # "generation_max_length": max_target_length, } From 1724b603019d7b080c0ff68182f8b3308300b373 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Sat, 29 Jul 2023 16:06:58 -0500 Subject: [PATCH 06/16] :bug: Remove task ids from resource tokenization functions Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 42 ++++++------------- .../pretrained_model/hf_auto_causal_lm.py | 3 +- .../pretrained_model/hf_auto_seq2seq_lm.py | 8 ++-- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 16cfda15..8183c89e 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -15,11 +15,7 @@ # Third Party from torch.utils.data import IterableDataset -from transformers import ( - AutoConfig, - AutoTokenizer, - Trainer, -) +from transformers import AutoConfig, AutoTokenizer, Trainer import torch # First Party @@ -114,6 +110,7 @@ def train( error.type_check("", PretrainedModelBase, base_model=base_model) ## Generate data loader from stream training_dataset: IterableDataset = cls._preprocess_function( + base_model=base_model, train_stream=train_stream, tokenizer=base_model.tokenizer, max_source_length=max_source_length, @@ -259,6 +256,7 @@ def run( @staticmethod def _preprocess_function( + base_model: PretrainedModelBase, train_stream: DataStream[GenerationTrainRecord], tokenizer: AutoTokenizer, max_source_length: int, @@ -267,28 +265,14 @@ def _preprocess_function( ): """Pre-process each example to get it prepared for training.""" - # FIXME: Below is currently configured for Seq2Seq only - - def _tokenization_func( - example: GenerationTrainRecord, - ): - model_inputs = tokenizer( - example.input, - max_length=max_source_length, - truncation=True, - ) - - labels = tokenizer( - example.output, - max_length=max_target_length, - padding="max_length", - truncation=True, - ) - - model_inputs["labels"] = labels["input_ids"] - - return model_inputs - - return SimpleIterableStreamWrapper( - train_stream.map(_tokenization_func), shuffle=shuffle + ( + tokenize_function, + requires_unwrapping, + ) = base_model.build_task_tokenize_function( + tokenizer, max_source_length, max_target_length, verbalizer="" ) + mapped_stream = train_stream.map(tokenize_function) + if requires_unwrapping: + mapped_stream = mapped_stream.flatten() + + return SimpleIterableStreamWrapper(mapped_stream, shuffle=shuffle) diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py index b98a2983..cefe7838 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py @@ -104,7 +104,8 @@ def tokenize_function_language_model( # Here, we need to yield and manipulate the attention mask to attend # to the input seq + the tokens we have seen so far... 
num_target_samples = len(target_ids.input_ids) - source_ids["task_ids"] = 0 + # TODO: Why do we need task ids here?? + # source_ids["task_ids"] = 0 def generator_func(): for idx in range(num_target_samples): diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index cd6843ff..6951f5fb 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -89,10 +89,7 @@ def get_trainer( 4. preprocess_logits_for_metrics """ - training_args = Seq2SeqTrainingArguments( - predict_with_generate=True, - **kwargs - ) + training_args = Seq2SeqTrainingArguments(predict_with_generate=True, **kwargs) # pylint: disable=duplicate-code # TODO: Fetch DataCollator either from property of this @@ -197,7 +194,8 @@ def tokenize_function_seq2seq( map(lambda x: IGNORE_ID if x == tokenizer.pad_token_id else x, labels) ) model_inputs["labels"] = labels - model_inputs["task_ids"] = 0 + # TODO: Why do we need task ids here?? + # model_inputs["task_ids"] = 0 return model_inputs return (tokenize_function_seq2seq, False) From 9a8f877b285b7ddc81e6544d673a67bd32bf27e4 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Sun, 30 Jul 2023 14:18:37 -0500 Subject: [PATCH 07/16] :white_check_mark: Add cuda device fixture to get around cuda unit testing when available Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 1 - .../text_generation/peft_prompt_tuning.py | 26 ++++++++++++------- caikit_nlp/resources/pretrained_model/base.py | 5 ++++ .../pretrained_model/hf_auto_causal_lm.py | 12 ++++++--- .../pretrained_model/hf_auto_seq2seq_lm.py | 10 +++++-- tests/fixtures/__init__.py | 13 ++++++++++ .../text_generation/test_fine_tuning.py | 6 ++--- .../test_peft_prompt_tuning.py | 8 +++--- tests/resources/test_pretrained_model.py | 3 +++ 9 files changed, 63 insertions(+), 21 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 8183c89e..38e907df 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -153,7 +153,6 @@ def train( "learning_rate": lr, "weight_decay": 0.01, "save_total_limit": 3, - "fp16": True, "push_to_hub": False, "no_cuda": False, # Default "remove_unused_columns": False, diff --git a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py index 6b4e7e7f..60c6efcc 100644 --- a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py +++ b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py @@ -173,7 +173,6 @@ def __del__(self): def run( self, text: str, - device: Optional[Union[str, int]] = _DETECT_DEVICE, max_new_tokens=20, min_new_tokens=0, ) -> GeneratedTextResult: @@ -182,8 +181,6 @@ def run( Args: text: str Input string to be used to the generation model. - device: Optional[Union[str, int]] - Device on which we should run inference; by default, we use the detected device. max_new_tokens: int The maximum numbers of tokens to generate. 
Default: 20 @@ -199,8 +196,8 @@ def run( verbalized_text = render_verbalizer(self.verbalizer, {"input": text}) # Apply the tokenizer to the sample text & move to correct device tok_tensors = self.tokenizer(verbalized_text, return_tensors="pt") - device = PeftPromptTuning._get_device(device) - inputs = {k: v.to(device) for k, v in tok_tensors.items()} + + inputs = {k: v.to(self.model.device) for k, v in tok_tensors.items()} with torch.no_grad(): # Run tokenized tensors through the rest of the PEFT model outputs = self.model.generate( @@ -604,7 +601,12 @@ def save(self, model_path: str, save_base_model: bool = False): module_saver.update_config(config_options) @classmethod - def load(cls, model_path: str, torch_dtype: str = None) -> "PeftPromptTuning": + def load( + cls, + model_path: str, + torch_dtype: str = None, + device: str = _DETECT_DEVICE, # TODO: Union[int, str] + ) -> "PeftPromptTuning": """Load a PEFT prompt tuning model. This method will currently fail if the original model was not saved with the arg value save_base_model=True. @@ -626,7 +628,7 @@ def load(cls, model_path: str, torch_dtype: str = None) -> "PeftPromptTuning": torch_dtype = str_to_torch_dtype(config.trained_torch_dtype) if config.has_base_model: # TODO: Implement logic for resource loading - device = cls._get_device(cls._DETECT_DEVICE) + device = cls._get_device(device) model_config = os.path.join(model_path, config.full_model_path) peft_config = PeftConfig.from_pretrained(model_config) if peft_config.task_type == "CAUSAL_LM": @@ -1005,7 +1007,7 @@ def _get_data_loaders_from_stream( tokenize_function, requires_unwrapping, ) = base_model.build_task_tokenize_function( - tokenizer, max_source_length, max_target_length, verbalizer + tokenizer, max_source_length, max_target_length, verbalizer, task_ids=0 ) mapped_stream = train_stream.map(tokenize_function) if requires_unwrapping: @@ -1066,7 +1068,13 @@ def _execute_train_loop( num_training_steps=(len(train_dataloader) * num_epochs), ) # Configure accelerator for gradient accumulation - accelerator = Accelerator(gradient_accumulation_steps=accumulate_steps) + accelerator_args = { + "gradient_accumulation_steps": accumulate_steps, + "device_placement": True + } + + accelerator = Accelerator(**accelerator_args) + for epoch in range(num_epochs): model.train() total_loss = 0 diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py index 6e9ea641..e4a732b2 100644 --- a/caikit_nlp/resources/pretrained_model/base.py +++ b/caikit_nlp/resources/pretrained_model/base.py @@ -306,6 +306,7 @@ def build_task_tokenize_function( max_source_length: int, max_target_length: int, verbalizer: str, + task_ids: Union[None, int] = None, ) -> Tuple[Callable, bool]: """Builds tokenizer functions which can be mapped over train streams to process data which can then be easily passed to a DataLoader for different model types. @@ -320,6 +321,10 @@ def build_task_tokenize_function( verbalizer: str Verbalizer template to be used for formatting data. This template may use brackets to indicate where fields from the data model TrainGenerationRecord must be rendered. + task_ids: Union[None, int] + Task id corresponding particular task for multi-task prompt tuning. 
+ NOTE: Only required for MPT (Multi-task prompt tuning) + Default: None Returns: Tuple(Callable, bool) diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py index cefe7838..6a832ac5 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py @@ -16,7 +16,7 @@ """ # Standard from copy import deepcopy -from typing import Callable, Tuple +from typing import Callable, Tuple, Union # Third Party from transformers import AutoModelForCausalLM @@ -52,6 +52,7 @@ def build_task_tokenize_function( max_source_length: int, max_target_length: int, verbalizer: str, + task_ids: Union[None, int] = None, ) -> Tuple[Callable, bool]: """Builds tokenizer functions which can be mapped over train streams to process data which can then be easily passed to a DataLoader for CausalLM models. @@ -66,6 +67,10 @@ def build_task_tokenize_function( verbalizer: str Verbalizer template to be used for formatting data. This template may use brackets to indicate where fields from the data model TrainGenerationRecord must be rendered. + task_ids: Union[None, int] + Task id corresponding particular task for multi-task prompt tuning. + NOTE: Only required for MPT (Multi-task prompt tuning) + Default: None Returns: Tuple(Callable, bool) @@ -104,8 +109,9 @@ def tokenize_function_language_model( # Here, we need to yield and manipulate the attention mask to attend # to the input seq + the tokens we have seen so far... num_target_samples = len(target_ids.input_ids) - # TODO: Why do we need task ids here?? - # source_ids["task_ids"] = 0 + + if task_ids is not None: + source_ids["task_ids"] = task_ids def generator_func(): for idx in range(num_target_samples): diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index 6951f5fb..880b6db1 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -134,6 +134,7 @@ def build_task_tokenize_function( max_source_length: int, max_target_length: int, verbalizer: str, + task_ids: Union[None, int] = None, ) -> Tuple[Callable, bool]: """Builds tokenizer functions which can be mapped over train streams to process data which can then be easily passed to a DataLoader for seq2seq models. @@ -148,6 +149,10 @@ def build_task_tokenize_function( verbalizer: str Verbalizer template to be used for formatting data. This template may use brackets to indicate where fields from the data model TrainGenerationRecord must be rendered. + task_ids: Union[None, int] + Task id corresponding particular task for multi-task prompt tuning. + NOTE: Only required for MPT (Multi-task prompt tuning) + Default: None Returns: Tuple(Callable, bool) @@ -194,8 +199,9 @@ def tokenize_function_seq2seq( map(lambda x: IGNORE_ID if x == tokenizer.pad_token_id else x, labels) ) model_inputs["labels"] = labels - # TODO: Why do we need task ids here?? 
- # model_inputs["task_ids"] = 0 + if task_ids is not None: + model_inputs["task_ids"] = task_ids + return model_inputs return (tokenize_function_seq2seq, False) diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py index 2cc4f0b7..ffc8f187 100644 --- a/tests/fixtures/__init__.py +++ b/tests/fixtures/__init__.py @@ -32,6 +32,19 @@ SEQ2SEQ_LM_MODEL = os.path.join(TINY_MODELS_DIR, "T5ForConditionalGeneration") +@pytest.fixture() +def set_cpu_device(request): + """Fixture to set default cuda device. + This fixture is particularly useful for running the unit tests where + cuda devices are available, in which case, some transformers function + may try to consume cuda and give device mismatch error. + """ + visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "") + os.environ["CUDA_VISIBLE_DEVICES"] = "" + with mock.patch.object(torch.cuda, 'is_available', return_value=False): + yield + os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices + @pytest.fixture def disable_wip(request): """Fixture to temporarily disable wip decorator""" diff --git a/tests/modules/text_generation/test_fine_tuning.py b/tests/modules/text_generation/test_fine_tuning.py index 64974425..458e8794 100644 --- a/tests/modules/text_generation/test_fine_tuning.py +++ b/tests/modules/text_generation/test_fine_tuning.py @@ -11,10 +11,10 @@ from caikit_nlp.data_model import GenerationTrainRecord from caikit_nlp.modules.text_generation import FineTuning from caikit_nlp.resources.pretrained_model import HFAutoCausalLM, HFAutoSeq2SeqLM -from tests.fixtures import CAUSAL_LM_MODEL, SEQ2SEQ_LM_MODEL, disable_wip +from tests.fixtures import CAUSAL_LM_MODEL, SEQ2SEQ_LM_MODEL, disable_wip, set_cpu_device -def test_train_model_seq2seq(disable_wip): +def test_train_model_seq2seq(disable_wip, set_cpu_device): """Ensure that we can finetune a seq2seq model on some toy data for 1+ steps & run inference.""" train_kwargs = { @@ -41,7 +41,7 @@ def test_train_model_seq2seq(disable_wip): assert isinstance(pred, GeneratedTextResult) -def test_train_model_causallm(disable_wip): +def test_train_model_causallm(disable_wip, set_cpu_device): """Ensure that we can finetune a causal-lm model on some toy data for 1+ steps & run inference.""" train_kwargs = { diff --git a/tests/modules/text_generation/test_peft_prompt_tuning.py b/tests/modules/text_generation/test_peft_prompt_tuning.py index 8ce87ff2..22361b9d 100644 --- a/tests/modules/text_generation/test_peft_prompt_tuning.py +++ b/tests/modules/text_generation/test_peft_prompt_tuning.py @@ -28,6 +28,7 @@ from tests.fixtures import ( causal_lm_dummy_model, causal_lm_train_kwargs, + set_cpu_device, seq2seq_lm_dummy_model, seq2seq_lm_train_kwargs, ) @@ -36,8 +37,9 @@ # Indexes into the peft config dictionary to get the actual prompt tuning config DEFAULT_ADAPTER = "default" + ### Tests validating block interfaces and behavior -def test_save_and_reload_with_base_model(causal_lm_dummy_model): +def test_save_and_reload_with_base_model(causal_lm_dummy_model, set_cpu_device): """Ensure that we can save a model + its base to a tempdir and reload it.""" with tempfile.TemporaryDirectory() as model_dir: causal_lm_dummy_model.save(model_dir, save_base_model=True) @@ -109,7 +111,7 @@ def test_verbalizer_cannot_be_static(causal_lm_train_kwargs): ) -def test_train_model(causal_lm_train_kwargs): +def test_train_model(causal_lm_train_kwargs, set_cpu_device): """Ensure that we can train a model on some toy data for 1+ steps & run inference.""" patch_kwargs = { "num_epochs": 1, @@ -138,7 +140,7 @@ def 
test_train_model(causal_lm_train_kwargs): assert isinstance(pred, GeneratedTextResult) -def test_train_model_classification_record(causal_lm_train_kwargs): +def test_train_model_classification_record(causal_lm_train_kwargs, set_cpu_device): """Ensure that we can train a model on some toy data for 1+ steps & run inference.""" patch_kwargs = { "num_epochs": 1, diff --git a/tests/resources/test_pretrained_model.py b/tests/resources/test_pretrained_model.py index 0b377e28..04d5062e 100644 --- a/tests/resources/test_pretrained_model.py +++ b/tests/resources/test_pretrained_model.py @@ -128,6 +128,7 @@ def test_causal_lm_tok_output_correctness(models_cache_dir): max_source_length=100, max_target_length=100, verbalizer="{{input}}", + task_ids=0 ) input_tok = causal_lm.tokenizer.encode(sample.input) output_tok = causal_lm.tokenizer.encode(sample.output) @@ -170,6 +171,7 @@ def test_seq2seq_tokenize_func_contains_unwrapped_stream(models_cache_dir): max_source_length=100, max_target_length=100, verbalizer="{{input}}", + task_ids=0, ) tok_res = tok_func(GenerationTrainRecord(input="hello", output="world")) map_stream = SAMPLE_TRAINING_DATA.map(tok_func) @@ -195,6 +197,7 @@ def test_seq2seq_tok_output_correctness(models_cache_dir): max_source_length=20, max_target_length=20, verbalizer="{{input}}", + task_ids=0, ) input_tok = seq2seq.tokenizer.encode(sample.input) output_tok = seq2seq.tokenizer.encode(sample.output) From 0c2df95fcee982c8771315fc77c21107644117cc Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Sun, 30 Jul 2023 14:19:25 -0500 Subject: [PATCH 08/16] :art: Fix formatting Signed-off-by: gkumbhat --- caikit_nlp/modules/text_generation/peft_prompt_tuning.py | 4 ++-- tests/fixtures/__init__.py | 3 ++- tests/modules/text_generation/test_fine_tuning.py | 7 ++++++- tests/modules/text_generation/test_peft_prompt_tuning.py | 2 +- tests/resources/test_pretrained_model.py | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py index 60c6efcc..4f4c67a7 100644 --- a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py +++ b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py @@ -606,7 +606,7 @@ def load( model_path: str, torch_dtype: str = None, device: str = _DETECT_DEVICE, # TODO: Union[int, str] - ) -> "PeftPromptTuning": + ) -> "PeftPromptTuning": """Load a PEFT prompt tuning model. This method will currently fail if the original model was not saved with the arg value save_base_model=True. 
@@ -1070,7 +1070,7 @@ def _execute_train_loop( # Configure accelerator for gradient accumulation accelerator_args = { "gradient_accumulation_steps": accumulate_steps, - "device_placement": True + "device_placement": True, } accelerator = Accelerator(**accelerator_args) diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py index ffc8f187..324cdee3 100644 --- a/tests/fixtures/__init__.py +++ b/tests/fixtures/__init__.py @@ -41,10 +41,11 @@ def set_cpu_device(request): """ visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "") os.environ["CUDA_VISIBLE_DEVICES"] = "" - with mock.patch.object(torch.cuda, 'is_available', return_value=False): + with mock.patch.object(torch.cuda, "is_available", return_value=False): yield os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices + @pytest.fixture def disable_wip(request): """Fixture to temporarily disable wip decorator""" diff --git a/tests/modules/text_generation/test_fine_tuning.py b/tests/modules/text_generation/test_fine_tuning.py index 458e8794..d3c42741 100644 --- a/tests/modules/text_generation/test_fine_tuning.py +++ b/tests/modules/text_generation/test_fine_tuning.py @@ -11,7 +11,12 @@ from caikit_nlp.data_model import GenerationTrainRecord from caikit_nlp.modules.text_generation import FineTuning from caikit_nlp.resources.pretrained_model import HFAutoCausalLM, HFAutoSeq2SeqLM -from tests.fixtures import CAUSAL_LM_MODEL, SEQ2SEQ_LM_MODEL, disable_wip, set_cpu_device +from tests.fixtures import ( + CAUSAL_LM_MODEL, + SEQ2SEQ_LM_MODEL, + disable_wip, + set_cpu_device, +) def test_train_model_seq2seq(disable_wip, set_cpu_device): diff --git a/tests/modules/text_generation/test_peft_prompt_tuning.py b/tests/modules/text_generation/test_peft_prompt_tuning.py index 22361b9d..907338d0 100644 --- a/tests/modules/text_generation/test_peft_prompt_tuning.py +++ b/tests/modules/text_generation/test_peft_prompt_tuning.py @@ -28,9 +28,9 @@ from tests.fixtures import ( causal_lm_dummy_model, causal_lm_train_kwargs, - set_cpu_device, seq2seq_lm_dummy_model, seq2seq_lm_train_kwargs, + set_cpu_device, ) import caikit_nlp diff --git a/tests/resources/test_pretrained_model.py b/tests/resources/test_pretrained_model.py index 04d5062e..d7e5a748 100644 --- a/tests/resources/test_pretrained_model.py +++ b/tests/resources/test_pretrained_model.py @@ -128,7 +128,7 @@ def test_causal_lm_tok_output_correctness(models_cache_dir): max_source_length=100, max_target_length=100, verbalizer="{{input}}", - task_ids=0 + task_ids=0, ) input_tok = causal_lm.tokenizer.encode(sample.input) output_tok = causal_lm.tokenizer.encode(sample.output) From 9230f4e7c665be8034a94897344918a007cbd45d Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Sun, 30 Jul 2023 18:24:58 -0500 Subject: [PATCH 09/16] :white_check_mark::sparkles: Make fine-tuning work for causal lm models and make tests pass Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 28 ++++++++++++----- .../pretrained_model/hf_auto_causal_lm.py | 31 ++++++++++++++++++- .../text_generation/test_fine_tuning.py | 6 ++-- 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 38e907df..0f50bd68 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -15,7 +15,7 @@ # Third Party from torch.utils.data import IterableDataset -from transformers import AutoConfig, AutoTokenizer, Trainer +from transformers import AutoConfig, 
AutoTokenizer, Seq2SeqTrainer, Trainer import torch # First Party @@ -228,13 +228,27 @@ def run( # and thus the device placement be according to training strategy, # its better to let Trainer handle the evaluation / prediction - # TODO: Add support for passing extra arguments to prediction_step + generate_args = { + "prediction_loss_only": False, + } + if isinstance(self.model, Seq2SeqTrainer): + generate_args["max_new_tokens"] = max_new_tokens + generate_args["min_new_tokens"] = min_new_tokens + else: + # NOTE: Currently the default trainer doesn't support easy way to run individual + # samples without converting them into Datasets etc. There is a + # predict_with_generate flag, but it doesn't do anything. + # Applicable for transformers==4.31.0 + error( + "", + NotImplementedError( + f"Generation on {type(self.model)} not support \ + currently! Please try saving and running this model in TGIS." + ), + ) + _, generated_tokens, _ = self.model.prediction_step( - self.model.model, - tok_tensors, - prediction_loss_only=False, - max_new_tokens=max_new_tokens, - min_new_tokens=min_new_tokens, + self.model.model, tok_tensors, **generate_args ) generated_text = self.tokenizer.batch_decode( diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py index 6a832ac5..30c0be20 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_causal_lm.py @@ -19,7 +19,7 @@ from typing import Callable, Tuple, Union # Third Party -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling from transformers.models.auto import modeling_auto # First Party @@ -129,3 +129,32 @@ def generator_func(): return DataStream(generator_func) return (tokenize_function_language_model, True) + + def _get_data_collator(self, **kwargs): + """Function to return appropriate data collator based on resource. + + DataCollatorForLanguageModeling is used here which will dynamically + padded to maximum length of a batch if they are not all of the same + length. + + NOTE: If mlm (masked language modeling) is not passed in kwargs, + this function will automatically set it to `False`. + + Args: + **kwargs: + All the keyword arguments passed to this function + will get filtered out to appropriate ones that are + applicable to implemented data collator. 
+ Returns: + transformers.DataCollator + """ + + applicable_args = ["mlm", "pad_to_multiple_of"] + collator_kwargs = {key: kwargs[key] for key in applicable_args if key in kwargs} + + if "mlm" not in collator_kwargs: + collator_kwargs["mlm"] = False + + return DataCollatorForLanguageModeling( + tokenizer=self._tokenizer, return_tensors="pt", **collator_kwargs + ) diff --git a/tests/modules/text_generation/test_fine_tuning.py b/tests/modules/text_generation/test_fine_tuning.py index d3c42741..64060961 100644 --- a/tests/modules/text_generation/test_fine_tuning.py +++ b/tests/modules/text_generation/test_fine_tuning.py @@ -65,9 +65,9 @@ def test_train_model_causallm(disable_wip, set_cpu_device): } model = FineTuning.train(**train_kwargs) assert isinstance(model.model, Trainer) - # Ensure that we can get something out of it - pred = model.run("@bar what a cute cat!") - assert isinstance(pred, GeneratedTextResult) + + with pytest.raises(NotImplementedError): + model.run("@bar what a cute cat!") ############################## Error Cases ################################ From f46897389966a02ebda5927f97dbaf797e70c747 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Tue, 1 Aug 2023 12:48:49 -0500 Subject: [PATCH 10/16] :wrench: Make review changes and add docstring for arguments Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 37 ++++++++++++++++++- .../text_generation/peft_prompt_tuning.py | 15 ++++---- caikit_nlp/resources/pretrained_model/base.py | 4 ++ .../pretrained_model/hf_auto_seq2seq_lm.py | 4 ++ scripts/dump_apis.sh | 14 +++---- 5 files changed, 59 insertions(+), 15 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 0f50bd68..5ed10c96 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -78,7 +78,42 @@ def train( **training_arguments, ): """ - # FIXME: Below is currently configured for Seq2Seq only + Fine-tune a CausalLM or Seq2seq text generation model. + + Args: + base_model: Union[str, caikit_nlp.resources.pretrained_model.base.PretrainedModelBase] + Base resource model used for underlying generation. + train_stream: DataStream[GenerationTrainRecord] or DataStream[ClassificationTrainRecord] + Data to be used for training the prompt vectors of the generation model. + torch_dtype: str + TODO: Optional[Union[torch.dtype, str]] + Data type to use for training/inference of the underlying text generation model. + If no value is provided, we pull from torch_dtype in config. If an in memory + resource is provided which does not match the specified data type, the model + underpinning the resource will be converted in place to the correct torch dtype. + max_source_length: int + Max length of input sequences being considered. Default: 256. + max_target_length: int + Max length of target sequences being predicted. Default: 128. + batch_size: int + Batch sized to be used for training / evaluation data. Default: 8. + num_epochs: int + Number of epochs to tune the model. Default: 20. + accumulate_steps: int + Number of steps to use for gradient accumulation. Default: 1. + lr: float + Learning rate to be used while tuning model. Default: 2e-5. + checkpoint_dir: str + Directory where model predictions and checkpoints will be written + **training_arguments: + Arguments supported by HF Training Arguments. 
+ TrainingArguments: + https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/trainer#transformers.TrainingArguments + Seq2SeqTrainingArguments: + https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments + Returns: + FineTuning + Instance of this class with fine-tuned models. """ torch_dtype = get_torch_dtype(torch_dtype) diff --git a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py index 4f4c67a7..2f4049c8 100644 --- a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py +++ b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py @@ -173,6 +173,7 @@ def __del__(self): def run( self, text: str, + device: Optional[Union[str, int]] = _DETECT_DEVICE, max_new_tokens=20, min_new_tokens=0, ) -> GeneratedTextResult: @@ -181,6 +182,8 @@ def run( Args: text: str Input string to be used to the generation model. + device: Optional[Union[str, int]] + Device on which we should run inference; by default, we use the detected device. max_new_tokens: int The maximum numbers of tokens to generate. Default: 20 @@ -197,7 +200,8 @@ def run( # Apply the tokenizer to the sample text & move to correct device tok_tensors = self.tokenizer(verbalized_text, return_tensors="pt") - inputs = {k: v.to(self.model.device) for k, v in tok_tensors.items()} + device = PeftPromptTuning._get_device(device) + inputs = {k: v.to(device) for k, v in tok_tensors.items()} with torch.no_grad(): # Run tokenized tensors through the rest of the PEFT model outputs = self.model.generate( @@ -1067,13 +1071,10 @@ def _execute_train_loop( num_warmup_steps=0, num_training_steps=(len(train_dataloader) * num_epochs), ) - # Configure accelerator for gradient accumulation - accelerator_args = { - "gradient_accumulation_steps": accumulate_steps, - "device_placement": True, - } - accelerator = Accelerator(**accelerator_args) + accelerator = Accelerator( + gradient_accumulation_steps=accumulate_steps, device_placement=True + ) for epoch in range(num_epochs): model.train() diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py index e4a732b2..87491917 100644 --- a/caikit_nlp/resources/pretrained_model/base.py +++ b/caikit_nlp/resources/pretrained_model/base.py @@ -247,6 +247,10 @@ def get_trainer( **kwargs, ): """ + Args: + *kwargs: arguments supported by HF TrainingArguments: + https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/trainer#transformers.TrainingArguments + NOTE: following parameters are not supported currently: 1. model_init 2. compute_metrics diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index 880b6db1..fdea36d8 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -82,6 +82,10 @@ def get_trainer( **kwargs ): """ + Args: + *kwargs: arguments supported by HF Seq2SeqTrainingArguments: + https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments + NOTE: following parameters are not supported currently: 1. model_init 2. 
compute_metrics diff --git a/scripts/dump_apis.sh b/scripts/dump_apis.sh index f277a827..249704f4 100755 --- a/scripts/dump_apis.sh +++ b/scripts/dump_apis.sh @@ -1,18 +1,18 @@ #!/usr/bin/env bash # Make a directory with interfaces -http_interface_dir="generated_interfaces/http" -grpc_interface_dir="generated_interfaces/grpc" +http_interface_dir="temp_dump/http" +grpc_interface_dir="temp_dump/grpc" mkdir -p $http_interface_dir mkdir -p $grpc_interface_dir # Run the HTTP server in the background -RUNTIME_LIBRARY=caikit_nlp python -m caikit.runtime.http_server & -http_pid=$! +# RUNTIME_LIBRARY=caikit_nlp python -m caikit.runtime.http_server & +# http_pid=$! -# Sleep for a bit and then call it to get the swagger doc -sleep 5 -curl http://localhost:8080/openapi.json | jq > $http_interface_dir/openapi.json +# # Sleep for a bit and then call it to get the swagger doc +# sleep 5 +# curl http://localhost:8080/openapi.json | jq > $http_interface_dir/openapi.json # Kill the HTTP server and wait for it to die kill -9 $http_pid From 4eda03b3a655d9688304b4981b07e5ef2355389e Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Tue, 1 Aug 2023 14:10:44 -0500 Subject: [PATCH 11/16] :sparkles: Add support for model.generate right after training by saving and loading the model Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 126 +++++++++--------- .../text_generation/test_fine_tuning.py | 9 +- 2 files changed, 70 insertions(+), 65 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 5ed10c96..0a70866f 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Standard +from typing import Optional # Third Party from torch.utils.data import IterableDataset @@ -28,7 +30,7 @@ # Local from ...data_model import GenerationTrainRecord -from ...resources.pretrained_model.base import PretrainedModelBase +from ...resources.pretrained_model import PretrainedModelBase, HFAutoCausalLM, HFAutoSeq2SeqLM from ...toolkit.data_stream_wrapper import SimpleIterableStreamWrapper from ...toolkit.data_type_utils import get_torch_dtype @@ -49,17 +51,25 @@ class FineTuning(ModuleBase): """Module to provide fine-tuning support for text generation task""" - def __init__(self, tokenizer, model): + supported_resources = [HFAutoCausalLM, HFAutoSeq2SeqLM] + + def __init__( + self, + tokenizer, + model, + bos_token: Optional[str] = None, + sep_token: Optional[str] = None, + eos_token: Optional[str] = None, + pad_token: Optional[str] = None, + ): super().__init__() self.tokenizer = tokenizer - # NOTE: self.model here can also be HF trainer. This is because - # if we have just trained the model then the models weights might be - # available in different devices (and configuration), depending on - # how it was trained. For now (July 10, 2023), we are not trying to - # extract the model out from trainer itself, since that would require - # us to essentially save it or reconstruct it to do normal inferring. self.model = model + self._bos_token = bos_token + self._sep_token = sep_token + self._eos_token = eos_token + self._pad_token = pad_token @classmethod def train( @@ -122,11 +132,12 @@ def train( # text_generation module. 
In future, we would want to consolidate this into # a base class or a toolkit function # pylint: disable=duplicate-code + resource_type = None + ## Load base model if isinstance(base_model, str): model_config = AutoConfig.from_pretrained(base_model) - resource_type = None for resource in cls.supported_resources: if model_config.model_type in resource.SUPPORTED_MODEL_TYPES: resource_type = resource @@ -140,8 +151,13 @@ def train( ), ) log.debug("Bootstrapping base resource [%s]", base_model) + breakpoint() base_model = resource_type.bootstrap(base_model, torch_dtype=torch_dtype) + else: + # base_model is actually a resource object + resource_type = type(base_model) + error.type_check("", PretrainedModelBase, base_model=base_model) ## Generate data loader from stream training_dataset: IterableDataset = cls._preprocess_function( @@ -217,17 +233,23 @@ def train( # Start training via Trainer.train function trainer.train() - # NOTE: By default the model would be available in different ways - # depending on where and how it was trained. So we need to fetch the model - # from the trainer depending on the training method, like fsdp, ddp etc. - # For simplicity, currently we will use trainer as the model since it anyways - # enable the `predict` function on it and has all the layers of the model - # distributed already, so it will be most optimized to use trainer to - # perform prediction at this stage. + + # save the model temporarily and reload it + # this is done, since otherwise the model might be distributed in different + # devices, in which case its better to use trainer's `prediction_step` + # functions, but then, they don't always give API similar to `generate` + # and thus cause incompatibilities in `run` function + trainer.save_model(checkpoint_dir) + + model = resource_type.bootstrap(checkpoint_dir, checkpoint_dir, torch_dtype=torch_dtype) return cls( - tokenizer=base_model.tokenizer, - model=trainer, + tokenizer=model.tokenizer, + model=model, + bos_token=model.tokenizer.bos_token or None, + sep_token=model.tokenizer.sep_token or None, + eos_token=model.tokenizer.eos_token or None, + pad_token=model.tokenizer.pad_token or None, ) # pylint: disable=unused-argument @@ -252,53 +274,35 @@ def run( GeneratedTextResult Generated text result """ - if isinstance(self.model, Trainer): - # Apply the tokenizer to the sample text & move to correct device - tok_tensors = self.tokenizer(text, return_tensors="pt") - # NOTE: below function is prediction on trainer, for which we need to supply - # the actual underlying model as well - # NOTE: We are using prediction_step instead of calling `self.model.generate` - # because this way HF Trainer automatically handles device placement of the - # data and model. Since the model is with Trainer at this point - # and thus the device placement be according to training strategy, - # its better to let Trainer handle the evaluation / prediction - - generate_args = { - "prediction_loss_only": False, - } - if isinstance(self.model, Seq2SeqTrainer): - generate_args["max_new_tokens"] = max_new_tokens - generate_args["min_new_tokens"] = min_new_tokens - else: - # NOTE: Currently the default trainer doesn't support easy way to run individual - # samples without converting them into Datasets etc. There is a - # predict_with_generate flag, but it doesn't do anything. - # Applicable for transformers==4.31.0 - error( - "", - NotImplementedError( - f"Generation on {type(self.model)} not support \ - currently! Please try saving and running this model in TGIS." 
- ), - ) - - _, generated_tokens, _ = self.model.prediction_step( - self.model.model, tok_tensors, **generate_args - ) - generated_text = self.tokenizer.batch_decode( - generated_tokens.detach().cpu().numpy(), skip_special_tokens=True - )[0] + inputs = self.model.tokenizer(text, return_tensors="pt") + generate_ids = self.model.model.generate( + input_ids=inputs["input_ids"], + max_new_tokens=max_new_tokens, + min_new_tokens=min_new_tokens, + use_cache=True, + ) - else: - error( - "", - NotImplementedError( - "model prediction on pre-finetuned model currently not supported" - ), + token_count = generate_ids.size(1) - 1 + preds = [ + self.model.tokenizer.decode( + g, skip_special_tokens=True, clean_up_tokenization_spaces=True ) + for g in generate_ids + ] + if generate_ids[0][-1].item() == self._eos_token: + finish_reason = "EOS_TOKEN" + elif generate_ids.size(1) - 1 == max_new_tokens: + finish_reason = "MAX_TOKENS" + else: + finish_reason = "OTHER" - return GeneratedTextResult(generated_text=generated_text) + return GeneratedTextResult( + generated_tokens=token_count, + generated_text=preds[0], + finish_reason=finish_reason, + producer_id=self.PRODUCER_ID, + ) ################################## Private Functions ########################################### diff --git a/tests/modules/text_generation/test_fine_tuning.py b/tests/modules/text_generation/test_fine_tuning.py index 64060961..a17f5ffa 100644 --- a/tests/modules/text_generation/test_fine_tuning.py +++ b/tests/modules/text_generation/test_fine_tuning.py @@ -40,7 +40,7 @@ def test_train_model_seq2seq(disable_wip, set_cpu_device): "torch_dtype": torch.float32, } model = FineTuning.train(**train_kwargs) - assert isinstance(model.model, Trainer) + assert isinstance(model.model, HFAutoSeq2SeqLM) # Ensure that we can get something out of it pred = model.run("@bar what a cute cat!") assert isinstance(pred, GeneratedTextResult) @@ -64,10 +64,11 @@ def test_train_model_causallm(disable_wip, set_cpu_device): "torch_dtype": torch.float32, } model = FineTuning.train(**train_kwargs) - assert isinstance(model.model, Trainer) + assert isinstance(model.model, HFAutoCausalLM) - with pytest.raises(NotImplementedError): - model.run("@bar what a cute cat!") + # Ensure that we can get something out of it + pred = model.run("@bar what a cute cat!") + assert isinstance(pred, GeneratedTextResult) ############################## Error Cases ################################ From ed7bbe639a197b309ff52b6c28076f4fdc7cf996 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Tue, 1 Aug 2023 14:11:46 -0500 Subject: [PATCH 12/16] :art: Fix formatting and linting Signed-off-by: gkumbhat --- .../modules/text_generation/fine_tuning.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 0a70866f..dab6a596 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -17,7 +17,7 @@ # Third Party from torch.utils.data import IterableDataset -from transformers import AutoConfig, AutoTokenizer, Seq2SeqTrainer, Trainer +from transformers import AutoConfig, AutoTokenizer import torch # First Party @@ -30,7 +30,11 @@ # Local from ...data_model import GenerationTrainRecord -from ...resources.pretrained_model import PretrainedModelBase, HFAutoCausalLM, HFAutoSeq2SeqLM +from ...resources.pretrained_model import ( + HFAutoCausalLM, + HFAutoSeq2SeqLM, + PretrainedModelBase, +) from 
...toolkit.data_stream_wrapper import SimpleIterableStreamWrapper from ...toolkit.data_type_utils import get_torch_dtype @@ -54,14 +58,14 @@ class FineTuning(ModuleBase): supported_resources = [HFAutoCausalLM, HFAutoSeq2SeqLM] def __init__( - self, - tokenizer, - model, - bos_token: Optional[str] = None, - sep_token: Optional[str] = None, - eos_token: Optional[str] = None, - pad_token: Optional[str] = None, - ): + self, + tokenizer, + model, + bos_token: Optional[str] = None, + sep_token: Optional[str] = None, + eos_token: Optional[str] = None, + pad_token: Optional[str] = None, + ): super().__init__() self.tokenizer = tokenizer @@ -151,7 +155,6 @@ def train( ), ) log.debug("Bootstrapping base resource [%s]", base_model) - breakpoint() base_model = resource_type.bootstrap(base_model, torch_dtype=torch_dtype) else: @@ -241,7 +244,9 @@ def train( # and thus cause incompatibilities in `run` function trainer.save_model(checkpoint_dir) - model = resource_type.bootstrap(checkpoint_dir, checkpoint_dir, torch_dtype=torch_dtype) + model = resource_type.bootstrap( + checkpoint_dir, checkpoint_dir, torch_dtype=torch_dtype + ) return cls( tokenizer=model.tokenizer, From 489076146150190fc0c4172065690f7c5bcf7e43 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Tue, 1 Aug 2023 15:04:31 -0500 Subject: [PATCH 13/16] :bug: Fix default verbalizer declaration in fine_tuning module Signed-off-by: gkumbhat --- caikit_nlp/modules/text_generation/fine_tuning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index dab6a596..2ea8561f 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -322,11 +322,14 @@ def _preprocess_function( ): """Pre-process each example to get it prepared for training.""" + # TODO: We are using a default verbalizer which is strictly tied to + # source training record currently. 
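Taken together, the flow this patch series lands on is: persist the trainer's weights, bootstrap a fresh resource from that checkpoint, and call plain `generate` on the reloaded model. A condensed sketch of that flow with the finish-reason bookkeeping folded in; the resource API and decode arguments follow the diffs, while comparing the last generated id against `tokenizer.eos_token_id` is an assumption here, since `generate` returns token ids rather than token strings:

def reload_and_generate(
    trainer, resource_type, checkpoint_dir, torch_dtype, text, max_new_tokens=20
):
    # Persist the fine-tuned weights, then reload them as a regular resource so
    # that model.generate is usable again even if the trainer had distributed
    # the in-memory weights across devices.
    trainer.save_model(checkpoint_dir)
    model = resource_type.bootstrap(
        checkpoint_dir, checkpoint_dir, torch_dtype=torch_dtype
    )

    inputs = model.tokenizer(text, return_tensors="pt")
    generate_ids = model.model.generate(
        input_ids=inputs["input_ids"],
        max_new_tokens=max_new_tokens,
        min_new_tokens=0,
        use_cache=True,
    )
    decoded = model.tokenizer.decode(
        generate_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    # Finish-reason logic, using the id-based EOS comparison noted above.
    if generate_ids[0, -1].item() == model.tokenizer.eos_token_id:
        finish_reason = "EOS_TOKEN"
    elif generate_ids.size(1) - 1 == max_new_tokens:
        finish_reason = "MAX_TOKENS"
    else:
        finish_reason = "OTHER"
    return decoded, finish_reason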
We need to figure out a better + # way to make verbalizer optional for build_task_tokenize_function ( tokenize_function, requires_unwrapping, ) = base_model.build_task_tokenize_function( - tokenizer, max_source_length, max_target_length, verbalizer="" + tokenizer, max_source_length, max_target_length, verbalizer="{{input}}" ) mapped_stream = train_stream.map(tokenize_function) if requires_unwrapping: From d3d962c35b6629b57848242805b636cfed62e20f Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Tue, 1 Aug 2023 17:26:43 -0500 Subject: [PATCH 14/16] :wrench: Update parameters for trainer and add random seed Signed-off-by: gkumbhat --- caikit_nlp/modules/text_generation/fine_tuning.py | 4 ++++ caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 2ea8561f..3c277bc0 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -55,6 +55,7 @@ class FineTuning(ModuleBase): """Module to provide fine-tuning support for text generation task""" + RANDOM_SEED = 73 supported_resources = [HFAutoCausalLM, HFAutoSeq2SeqLM] def __init__( @@ -86,6 +87,7 @@ def train( batch_size: int = 8, num_epochs: int = 5, accumulate_steps: int = 32, + random_seed: int = RANDOM_SEED, lr: float = 2e-5, # Directory where model predictions and checkpoints will be written checkpoint_dir: str = "/tmp", @@ -201,6 +203,7 @@ def train( "per_device_train_batch_size": batch_size, "per_device_eval_batch_size": batch_size, "num_train_epochs": num_epochs, + "seed": random_seed, # NOTE: We have disabled evaluation for now "do_eval": False, # "evaluation_strategy ": "epoch", @@ -214,6 +217,7 @@ def train( "gradient_accumulation_steps": accumulate_steps, "eval_accumulation_steps": accumulate_steps, # eval_steps=1, + # load_best_model_at_end **training_arguments, **dtype_based_params, } diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py index fdea36d8..bdd69aa1 100644 --- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py +++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py @@ -93,7 +93,8 @@ def get_trainer( 4. preprocess_logits_for_metrics """ - training_args = Seq2SeqTrainingArguments(predict_with_generate=True, **kwargs) + # NOTE: predict_with_generate is incompatible with fsdp + training_args = Seq2SeqTrainingArguments(**kwargs) # pylint: disable=duplicate-code # TODO: Fetch DataCollator either from property of this From f84a357097444d439435c4a11126356e0a0eef78 Mon Sep 17 00:00:00 2001 From: gkumbhat Date: Wed, 2 Aug 2023 17:45:38 -0500 Subject: [PATCH 15/16] :rewind: Revert back dump_api script changes Signed-off-by: gkumbhat --- scripts/dump_apis.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/dump_apis.sh b/scripts/dump_apis.sh index 249704f4..f277a827 100755 --- a/scripts/dump_apis.sh +++ b/scripts/dump_apis.sh @@ -1,18 +1,18 @@ #!/usr/bin/env bash # Make a directory with interfaces -http_interface_dir="temp_dump/http" -grpc_interface_dir="temp_dump/grpc" +http_interface_dir="generated_interfaces/http" +grpc_interface_dir="generated_interfaces/grpc" mkdir -p $http_interface_dir mkdir -p $grpc_interface_dir # Run the HTTP server in the background -# RUNTIME_LIBRARY=caikit_nlp python -m caikit.runtime.http_server & -# http_pid=$! 
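On the training-arguments side, the preceding patch mainly adds a fixed default seed (73, overridable through `random_seed`) and keeps routing everything through a plain dict before it reaches the HF `TrainingArguments` / `Seq2SeqTrainingArguments` constructor. Because the caller-supplied `**training_arguments` entries are unpacked after the defaults in that dict literal, they win on any key collision. A small self-contained illustration, with placeholder values:

# Defaults as assembled in train(), trimmed to a few representative keys.
defaults = {
    "output_dir": "/tmp",
    "num_train_epochs": 5,
    "seed": 73,
    "weight_decay": 0.01,
}

# Extra kwargs a caller forwards via **training_arguments.
overrides = {"weight_decay": 0.0, "logging_steps": 10}

# Later entries win, so the caller's weight_decay replaces the default while
# the seeded default stays in place.
training_args = {**defaults, **overrides}
assert training_args["weight_decay"] == 0.0
assert training_args["seed"] == 73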
+RUNTIME_LIBRARY=caikit_nlp python -m caikit.runtime.http_server & +http_pid=$! -# # Sleep for a bit and then call it to get the swagger doc -# sleep 5 -# curl http://localhost:8080/openapi.json | jq > $http_interface_dir/openapi.json +# Sleep for a bit and then call it to get the swagger doc +sleep 5 +curl http://localhost:8080/openapi.json | jq > $http_interface_dir/openapi.json # Kill the HTTP server and wait for it to die kill -9 $http_pid From 664a3d56b848f75a5147285a1caf2faf4e0a0ff6 Mon Sep 17 00:00:00 2001 From: Gaurav Kumbhat Date: Wed, 2 Aug 2023 17:48:31 -0500 Subject: [PATCH 16/16] Apply suggestions from code review Co-authored-by: Alex Brooks Signed-off-by: Gaurav Kumbhat Signed-off-by: gkumbhat --- caikit_nlp/modules/text_generation/fine_tuning.py | 2 +- caikit_nlp/resources/pretrained_model/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/caikit_nlp/modules/text_generation/fine_tuning.py b/caikit_nlp/modules/text_generation/fine_tuning.py index 3c277bc0..f3933516 100644 --- a/caikit_nlp/modules/text_generation/fine_tuning.py +++ b/caikit_nlp/modules/text_generation/fine_tuning.py @@ -100,7 +100,7 @@ def train( base_model: Union[str, caikit_nlp.resources.pretrained_model.base.PretrainedModelBase] Base resource model used for underlying generation. train_stream: DataStream[GenerationTrainRecord] or DataStream[ClassificationTrainRecord] - Data to be used for training the prompt vectors of the generation model. + Data to be used for fine-tuning the generation model. torch_dtype: str TODO: Optional[Union[torch.dtype, str]] Data type to use for training/inference of the underlying text generation model. diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py index 87491917..59bb4d45 100644 --- a/caikit_nlp/resources/pretrained_model/base.py +++ b/caikit_nlp/resources/pretrained_model/base.py @@ -248,7 +248,7 @@ def get_trainer( ): """ Args: - *kwargs: arguments supported by HF TrainingArguments: + **kwargs: arguments supported by HF TrainingArguments: https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/trainer#transformers.TrainingArguments NOTE: following parameters are not supported currently:
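Since `get_trainer` forwards `**kwargs` straight into the HF `TrainingArguments` (or `Seq2SeqTrainingArguments` for seq2seq resources), callers can tune any documented training knob without the module needing a dedicated parameter for each one. A hedged usage sketch follows; the resource class and the `get_trainer` call shape come from the diffs, while the model id, the `tokenized_dataset` variable, and the exact `bootstrap` signature are placeholders or assumptions:

from caikit_nlp.resources.pretrained_model import HFAutoSeq2SeqLM

# Placeholder checkpoint; any HF seq2seq model id could stand in here.
base_model = HFAutoSeq2SeqLM.bootstrap("google/flan-t5-small")

# tokenized_dataset is assumed to be an IterableDataset of already tokenized
# records, e.g. what the module's _preprocess_function produces.
trainer = base_model.get_trainer(
    train_dataset=tokenized_dataset,
    output_dir="/tmp/ft-demo",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    seed=73,
)
trainer.train()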