diff --git a/examples/commonsense_qa/README.md b/examples/commonsense_qa/README.md
new file mode 100644
index 0000000..c2524e4
--- /dev/null
+++ b/examples/commonsense_qa/README.md
@@ -0,0 +1,117 @@
+# commonsense_qa
+
+Author: Jingcheng Hu, hujc22@mails.tsinghua.edu.cn, https://reign12.github.io/
+
+Student ID: 2022312848
+
+## Task Description
+### Dataset Statistics
+We follow the train-validation-test split from [Huggingface commonsense_qa](https://huggingface.co/datasets/commonsense_qa).
+Note that answers are not provided for the test set, so we do not use the test set in our experiments. There are 9,741 samples in the training set and 1,221 samples in the validation set.
+
+### Task Introduction
+For the task prompts, we use prompt templates from [promptsource](https://github.com/bigscience-workshop/promptsource).
+commonsense_qa is a multiple-choice question-answering challenge that requires rich world knowledge.
+Here is an example:
+```python
+{
+    'id': '075e483d21c29a511267ef62bedc0461',
+    'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
+    'question_concept': 'punishing',
+    'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
+                'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']},
+    'answerKey': 'A'
+}
+```
+
+## How to Train and Eval
+### Dependency
+You can activate your own conda env and then run:
+```bash
+bash env_setup.sh cuda # if you are running on NVIDIA GPUs
+
+bash env_setup.sh rocm # if you are running on AMD GPUs
+```
+### Training and Evaluation
+Run `python main.py --help` or go directly to `./config/basic.yaml` to see all supported configurations.
+
+To run distributed training (evaluation is performed along the way; per-step loss, per-epoch loss and per-epoch accuracy are recorded):
+```bash
+torchrun --nproc_per_node=<num_gpus> main.py \
+    task="pc" \ # this is a Prompted Choice task
+    data.dataset="commonsense_qa" \
+    model.name="BAAI/glm-roberta-large" \ # we also support bert-large-uncased, roberta-large
+    data.prompt_id="2" \ # prompt_id among the original_task=True prompt templates from promptsource; the name of each prompt is printed in the training log when the job starts, e.g. "train dataset prompt_key ['answer_given_question_without_options', 'most_suitable_answer', 'question_answering', 'question_to_answer_index']"
+    jobname=<your_jobname> \
+    debug=False \ # set debug=True to disable wandb; wandb-related variables can be set as env vars or typed in when the program asks for them; see logger.py for details
+    optimizer.lr="1e-5" \ # no lr scaling is applied; this lr is the final lr
+    trainer.batch="32" \ # total batch size summed over all cards
+    trainer.accumulate_steps="2" \ # gradient accumulation steps for a larger effective batch size
+    trainer.epochs="10" trainer.warmup_epochs="1" # we use linear warmup and cosine decay
+    # more configs can be changed; see ./config/basic.yaml for details and simply follow the pattern here
+```
+
+## Results
+The final-epoch accuracy of the above command is 64.22% on the validation set.
+For RoBERTa-Large and BERT-Large we tuned the learning rate; the best accuracies are 74.59% and 62.90%, respectively.
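+The prompted-choice inputs behind these numbers can be inspected directly with promptsource. Below is a minimal sketch (assuming the dependencies above are installed; `prompt_id=2` matches the command above but is otherwise illustrative) that mirrors what `PCDataset.__getitem__` does internally:
+```python
+from datasets import load_dataset
+from promptsource.templates import DatasetTemplates
+
+# take one validation sample and the original_task=True templates for commonsense_qa
+sample = load_dataset("commonsense_qa", split="validation")[0]
+templates = DatasetTemplates("commonsense_qa")
+names = [n for n in templates.all_template_names if templates[n].metadata.original_task]
+prompter = templates[names[2]]
+
+# apply the template: prompted question, gold answer string, and candidate choices
+prompt, answer = prompter.apply(sample)
+choices = prompter.get_answer_choices_list(sample)
+print(prompt)
+print(choices, choices.index(answer))
+```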
+
+| Model | Accuracy |
+|:---:|:---:|
+| glm-roberta-large | 64.22 |
+| roberta-large | 74.59 |
+| bert-large-uncased | 62.90 |
+
+## Reference
+### Dataset
+The commonsense_qa dataset paper:
+```bibtex
+@inproceedings{Talmor2019,
+  title = {{{CommonsenseQA}}: {{A Question Answering Challenge Targeting Commonsense Knowledge}}},
+  shorttitle = {{{CommonsenseQA}}},
+  booktitle = {Proceedings of the 2019 {{Conference}} of the {{North American Chapter}} of the {{Association}} for {{Computational Linguistics}}: {{Human Language Technologies}}, {{Volume}} 1 ({{Long}} and {{Short Papers}})},
+  author = {Talmor, Alon and Herzig, Jonathan and Lourie, Nicholas and Berant, Jonathan},
+  date = {2019-06},
+  pages = {4149--4158},
+  publisher = {{Association for Computational Linguistics}},
+  location = {{Minneapolis, Minnesota}},
+  doi = {10.18653/v1/N19-1421},
+  url = {https://aclanthology.org/N19-1421},
+  urldate = {2023-01-07},
+  eventtitle = {{{NAACL-HLT}} 2019}
+}
+```
+The Hugging Face link for commonsense_qa is given above.
+
+For RoBERTa and BERT we directly use the Hugging Face implementations. The original papers are:
+```bibtex
+@misc{Liu2019,
+  title = {{{RoBERTa}}: {{A Robustly Optimized BERT Pretraining Approach}}},
+  shorttitle = {{{RoBERTa}}},
+  author = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
+  date = {2019-07-26},
+  number = {arXiv:1907.11692},
+  eprint = {1907.11692},
+  eprinttype = {arxiv},
+  primaryclass = {cs},
+  publisher = {{arXiv}},
+  url = {http://arxiv.org/abs/1907.11692},
+  urldate = {2023-01-07},
+  archiveprefix = {arXiv},
+  version = {1}
+}
+
+@unpublished{devlinBERTPretrainingDeep2019,
+  title = {{{BERT}}: {{Pre-training}} of {{Deep Bidirectional Transformers}} for {{Language Understanding}}},
+  shorttitle = {{{BERT}}},
+  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
+  date = {2019-05-24},
+  eprint = {1810.04805},
+  eprinttype = {arxiv},
+  primaryclass = {cs},
+  url = {http://arxiv.org/abs/1810.04805},
+  urldate = {2022-04-12},
+  archiveprefix = {arXiv}
+}
+```
\ No newline at end of file
diff --git a/examples/commonsense_qa/config/basic.yaml b/examples/commonsense_qa/config/basic.yaml
new file mode 100644
index 0000000..e090421
--- /dev/null
+++ b/examples/commonsense_qa/config/basic.yaml
@@ -0,0 +1,46 @@
+defaults:
+  - _self_
+  - override hydra/hydra_logging: disabled
+  - override hydra/job_logging: disabled
+
+hydra:
+  output_subdir: null
+  run:
+    dir: .
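+  # output_subdir: null and run.dir: . keep hydra from creating per-run output directories; the job runs in the current working directory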
+ +debug: False +jobname: test +task: "pc" # pg:generation, pc:choice +model: + # for pg: BAAI/glm-roberta-large,t5-large + # for pc: BAAI/glm-roberta-large, bert-large-uncased, roberta-large + name: "BAAI/glm-roberta-large" + max_length: 384 # dataset related + max_gen_length: 128 # dataset related +data: + dataset: "multi_news" # currently support commonsense_qa, multi_news + # dataset: "commonsense_qa" # currently support commonsense_qa, multi_news + tokenizer: ${model.name} + max_length: ${model.max_length} + max_gen_length: ${model.max_gen_length} + prompt_id: 0 # id in promptsource original_task=True name_l + answer_prompt: "Answer:" +optimizer: + lr: 1e-5 + beta1: 0.9 + beta2: 0.999 + wd: 0.01 +trainer: + batch: 64 # batch in total + accumulate_steps: 1 + epochs: 10 + lrscheduler: cosine + warmup_start: 1e-7 + warmup_epochs: 1 + num_workers: 1 # num_workers in total + pin_memory: True + log_interval: 100 + qualitative_num: 5 + checkpoint_dir: "./checkpoints" # path for ckps +distributed: + backend: "nccl" diff --git a/examples/commonsense_qa/data.py b/examples/commonsense_qa/data.py new file mode 100644 index 0000000..2f4557e --- /dev/null +++ b/examples/commonsense_qa/data.py @@ -0,0 +1,218 @@ +from typing import Dict,Tuple,List +from torch import Tensor + +import torch +from torch.utils.data import Dataset +from omegaconf import DictConfig + +# Support GLM, BART, T5 +from transformers import AutoTokenizer +# Support commonsense_qa, multi_news +from datasets import load_dataset + +# prompt support +from promptsource.templates import DatasetTemplates + +from einops import rearrange + +class PCDataCollator: + def __init__(self,datacollator_config: DictConfig): + self.datacollator_config = datacollator_config + self.tokenizer = AutoTokenizer.from_pretrained(self.datacollator_config.tokenizer,trust_remote_code=True) + self.collator = self.build_collator() + def __call__(self, batch:List[Tuple[str,List[str],int]]) -> Dict[str,Tensor]: + return self.collator(batch) + def build_collator(self): + if "glm" in self.datacollator_config.tokenizer: + return self.glm_collator + elif "roberta" in self.datacollator_config.tokenizer: + return self.roberta_collator + elif 'bert' in self.datacollator_config.tokenizer: + return self.roberta_collator + else: + raise NotImplementedError("Not implemented yet") + def roberta_collator(self,batch:List[Tuple[str,List[str],int]]) -> Dict[str,Tensor]: + prompt_l = [] + choice_l = [] + choice_ids = [] + for item in batch: + for choice in item[1]: + prompt_l.append(item[0]) + choice_l.append(choice) + choice_ids.append(item[2]) + res = self.tokenizer(prompt_l,choice_l, + return_tensors="pt", + padding=True,truncation=True,max_length=self.datacollator_config.max_length) + for key in res: + res[key] = rearrange(res[key],'(b c) l -> b c l',b=len(batch),c=len(item[1])) + + labels = torch.tensor(choice_ids) + res['labels'] = labels + return res + + def glm_collator(self,batch:List[Tuple[str,List[str],int]]) -> Dict[str,Tensor]: + prompts,choices_l,choice_ids = zip(*batch) + prompts = self.tokenizer(prompts,return_tensors="pt", + padding=True,truncation=True,max_length=self.datacollator_config.max_length + ) + res = self.tokenizer.build_inputs_for_multiple_choice(prompts,choices_l) + labels = torch.tensor(choice_ids) + res['labels'] = labels + return res + +class PCDataset(Dataset): + def __init__(self,dataset_config: DictConfig,split:str): + self.dataset_config = dataset_config + self.dataset = load_dataset(*dataset_config.dataset.split("/"),split=split) + 
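# build_prompter keeps only the original_task=True promptsource templates and selects one by data.prompt_id
+ 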
self.prompt_key,self.prompter = self.build_prompter() + self.adapter = self.build_adapter() + + def __len__(self): + return len(self.dataset) + def __getitem__(self, index: int) -> Tuple[str,List[str],int]: + data = self.dataset[index] + prompt,choice = self.prompter.apply(data) + choices_l = self.prompter.get_answer_choices_list(data) + choice_id = choices_l.index(choice) + prompt = prompt + "\n\n" + self.dataset_config.answer_prompt + res = self.adapter(prompt,choices_l,choice_id) + return res + def build_adapter(self): + if "glm" in self.dataset_config.tokenizer: + return self.glm_adapter + elif "roberta" in self.dataset_config.tokenizer: + return self.roberta_adapter + elif 'bert' in self.dataset_config.tokenizer: + return self.roberta_adapter + else: + raise NotImplementedError("Not implemented yet") + def roberta_adapter(self,prompt:str,choices_l:List[str],choice_id:int) -> Tuple[str,List[str],int]: + return prompt,choices_l,choice_id + def glm_adapter(self,prompt:str,choices_l:List[str],choice_id:int) -> Tuple[str,List[str],int]: + prompt += "[MASK]" + return prompt,choices_l,choice_id + def build_prompter(self): + all_prompts = DatasetTemplates(self.dataset_config.dataset) + # filter out those not original_task + prompt_key = [name for name in all_prompts.all_template_names if all_prompts[name].metadata.original_task ] + prompter = all_prompts[prompt_key[self.dataset_config.prompt_id]] + return prompt_key,prompter + +class PGDataCollator: + def __init__(self,datacollator_config: DictConfig,split:str): + self.datacollator_config = datacollator_config + self.split = split + self.tokenizer = AutoTokenizer.from_pretrained(self.datacollator_config.tokenizer,trust_remote_code=True) + self.collator = self.build_collator() + def build_collator(self): + if "glm" in self.datacollator_config.tokenizer: + if self.split == "train": + return self.glm_train_collator + else: + return self.glm_test_collator + elif "t5" in self.datacollator_config.tokenizer: + if self.split == "train": + return self.t5_train_collator + else: + return self.t5_test_collator + else: + raise NotImplementedError("Not implemented yet") + def t5_train_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for item in zip(*batch)] + self.tokenizer.truncation_side = 'left' + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res['labels'] = self.tokenizer(answers,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt")['input_ids'] + return res + def t5_test_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for item in zip(*batch)] + self.tokenizer.truncation_side = 'left' + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res['labels'] = [answer[len(" "):] for answer in answers] # rm the prepended + res['prompts'] = prompts + return res + def glm_train_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for item in zip(*batch)] + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res = self.tokenizer.build_inputs_for_generation(res,targets=answers,max_gen_length=self.datacollator_config.max_gen_length) + return res + def glm_test_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for 
item in zip(*batch)] + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res = self.tokenizer.build_inputs_for_generation(res,max_gen_length=self.datacollator_config.max_gen_length) + res['labels'] = answers + res['prompts'] = prompts + return res + + def __call__(self, batch: List[Tuple[str,str]]) -> Dict[str, Tensor]: + return self.collator(batch) + +class PGDataset(Dataset): + def __init__(self,dataset_config:DictConfig,split:str): + """ + split = "train" or "validation" or "test" + """ + self.dataset_config = dataset_config + self.max_length = dataset_config.max_length + self.max_gen_length = dataset_config.max_gen_length + + self.dataset = load_dataset(*dataset_config.dataset.split("/"),split=split) + self.prompt_key,self.prompter = self.build_prompter() + self.answer_prompt = dataset_config.answer_prompt + self.adapter = self.build_adapter() + + + def build_adapter(self): + adapter_name = self.dataset_config.tokenizer + if "glm" in adapter_name: + adapter = self.glm_adapter + elif "t5" in adapter_name: + adapter = self.t5_adapter + elif "bart" in adapter_name: + adapter = self.bart_adapter + else: + raise NotImplementedError(f"Adapter {adapter_name} is not supported") + return adapter + + def glm_adapter(self,prompted_data:Tuple[str,str])->Tuple[str,str]: + prompt,answer = prompted_data + # add mask token + prompt += "[MASK]" + res = prompt,answer + return res + + def t5_adapter(self,prompted_data): + prompt,answer = prompted_data + # add sentinel token for prompt and answer + prompt = f'{prompt} ' + answer = f' {answer}' + return prompt,answer + return res + + def build_prompter(self): + all_prompts = DatasetTemplates(self.dataset_config.dataset) + # filter out those not original_task + prompt_key = [name for name in all_prompts.all_template_names if all_prompts[name].metadata.original_task ] + prompter = all_prompts[prompt_key[self.dataset_config.prompt_id]] + return prompt_key,prompter + + def __len__(self)->int: + return len(self.dataset) + def __getitem__(self, index:int)->Tuple[str,str]: + # TODO: format the data using prompt, add mask token based on model, padding based on max_lenght, then pass the tokenizer + data = self.dataset[index] + prompted_data = self.prompter.apply(data) + prompted_data[0] = prompted_data[0] + "\n\n" + self.answer_prompt + res = self.adapter(prompted_data) + return res + + + +if __name__ == "__main__": + dataset = load_dataset("commonsense_qa") + print(dataset.keys()) + dataset = load_dataset("multi_news") + print(dataset.keys()) + + # multi_news_prompts = DatasetTemplates("multi_news") + # print(multi_news_prompts.all_template_names) + diff --git a/examples/commonsense_qa/env_setup.sh b/examples/commonsense_qa/env_setup.sh new file mode 100644 index 0000000..d77a0ff --- /dev/null +++ b/examples/commonsense_qa/env_setup.sh @@ -0,0 +1,13 @@ +# Support cuda and rocm environment +# for cuda we test on V100 and A100 +# for rocm we test on MI100, the image tested with building with pytorch-1.11.0-rocm5.1.3 + +GPU_ENV="$1" +if [ $GPU_ENV = "cuda" ] || [ $GPU_ENV = "rocm" ]; then + echo "Installing $GPU_ENV environment" + pip install -r "requirements_torch_${GPU_ENV}.txt" + pip install -r requirements.txt + pip install numpy tqdm -U +else + echo "Unsupported environment $GPU_ENV" +fi \ No newline at end of file diff --git a/examples/commonsense_qa/logger.py b/examples/commonsense_qa/logger.py new file mode 100644 index 0000000..659c3c6 --- /dev/null +++ 
b/examples/commonsense_qa/logger.py @@ -0,0 +1,49 @@ +import os +import sys +from typing import Dict +from omegaconf import OmegaConf,DictConfig +import wandb +import loguru + +class Logger: + def __init__(self, cfg:DictConfig): + self.cfg = cfg + self.logger = loguru.logger + self.logger.remove() + self.logger.add(f"{self.cfg.jobname}.log") + + def log_rank(self,data:Dict): + log_str = "\t".join(f"{key} {data[key]}" for key in data) + self.logger.info(log_str) + + def log_master(self,data:Dict,if_wandb:bool=True): + """ + only called in master process + """ + log_str = "\t".join(f"{key} {data[key]}" for key in data) + self.logger.info(log_str) + if self.cfg.debug != True and if_wandb: + wandb.log(data) + + def login(self): + """ + only login once in distributed training + """ + self.logger.add(sys.stderr) + # if you do not want to use wandb, you can set self.cfg.debug = True + if self.cfg.debug != True: + # you can use export VAR=VALUE to set environment variables before running the script + wandb_var_l = ["WANDB_API_KEY","WANDB_ENTITY","WANDB_PROJECT"] + for wandb_var in wandb_var_l: + if os.environ.get(wandb_var) is None: + os.environ[wandb_var] = input(f"Please input your {wandb_var}:") + wandb.login(key=os.environ['WANDB_API_KEY']) + wandb.init(project=os.environ['WANDB_PROJECT'], entity=os.environ['WANDB_ENTITY'], + name=self.cfg.jobname,config=OmegaConf.to_container(self.cfg,resolve=True)) + # wandb.config.update(self.cfg) + + self.logger.info("\nConfigs are:\n" + OmegaConf.to_yaml(self.cfg)) + +if __name__ == "__main__": + mylogger = Logger() + mylogger.log({"loss":0.1,"acc":0.9}) diff --git a/examples/commonsense_qa/lrscheduler.py b/examples/commonsense_qa/lrscheduler.py new file mode 100644 index 0000000..d41ba00 --- /dev/null +++ b/examples/commonsense_qa/lrscheduler.py @@ -0,0 +1,17 @@ +from omegaconf import DictConfig +from torch.optim.lr_scheduler import CosineAnnealingLR +from ignite.handlers import create_lr_scheduler_with_warmup + +def build_lrscheduler(optimizer, trainer_cfg:DictConfig,steps_per_epoch:int): + if trainer_cfg.lrscheduler == "cosine": + lrscheduler = CosineAnnealingLR( + optimizer, T_max=(trainer_cfg.epochs-trainer_cfg.warmup_epochs)*steps_per_epoch, + eta_min=trainer_cfg.warmup_start # set annealing end lr to warmup start lr + ) + lrscheduler = create_lr_scheduler_with_warmup( + lrscheduler,warmup_start_value=trainer_cfg.warmup_start, + warmup_duration=trainer_cfg.warmup_epochs*steps_per_epoch + ) + else: + raise NotImplementedError + return lrscheduler diff --git a/examples/commonsense_qa/main.py b/examples/commonsense_qa/main.py new file mode 100644 index 0000000..fec5f98 --- /dev/null +++ b/examples/commonsense_qa/main.py @@ -0,0 +1,237 @@ +import warnings +import hydra +from omegaconf import DictConfig + +from logger import Logger +from data import PGDataset,PGDataCollator,PCDataset,PCDataCollator +from model import PGModel, PCModel + +import torch +from torch import optim + +from typing import Tuple,List +from torch import Tensor + +import ignite.distributed as idist +from ignite.engine import Engine,Events +from ignite import metrics +from lrscheduler import build_lrscheduler +from ignite.handlers import Checkpoint, global_step_from_engine + +def main_engine(local_rank: int, cfg: DictConfig,**kwargs): + # ignore warnings + if not cfg.debug: + warnings.filterwarnings("ignore") + # Setup logger + logger = Logger(cfg) + if idist.get_rank() == 0: + logger.login() + 
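# rank 0 additionally sets up wandb and stderr logging; the loguru file sink from Logger.__init__ is active on every rank
+ 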
logger.log_rank({"rank":idist.get_rank(),"local_rank":local_rank,"world_size":idist.get_world_size()}) + + # Setup model + if cfg.task == "pg": + model = PGModel(cfg.model) + elif cfg.task == "pc": + model = PCModel(cfg.model) + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + model = idist.auto_model(model) + # Setup optimizer + optimizer = optim.AdamW(model.parameters(), + lr=cfg.optimizer.lr,betas=(cfg.optimizer.beta1,cfg.optimizer.beta2), + weight_decay=cfg.optimizer.wd, + ) + optimizer = idist.auto_optim(optimizer) + + def train_step(engine:Engine, batch:Tensor) -> Tensor: + model.train() + batch.to(idist.device()) + loss = model(batch).loss + loss = loss / cfg.trainer.accumulate_steps # accumulate gradients + loss.backward() + if engine.state.iteration % cfg.trainer.accumulate_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + + def pg_test_evaluate_step(engine:Engine, batch:Tensor) -> Tuple[List[str],List[str]]: + model.eval() + prompts = batch.pop("prompts") + labels = batch.pop("labels") + with torch.no_grad(): + batch.to(idist.device()) + if idist.get_world_size() > 1: + res = model.module.generate(batch) + else: + res = model.generate(batch) + return res, labels + + def pc_test_evaluate_step(engine:Engine, batch:Tensor) -> Tuple[List[str],List[str]]: + model.eval() + with torch.no_grad(): + batch.to(idist.device()) + labels = batch.pop("labels") + res = model(batch).logits + return res, labels + + # setup engines + trainer = Engine(train_step) + if cfg.task == "pg": + test_evaluate_step = pg_test_evaluate_step + elif cfg.task == "pc": + test_evaluate_step = pc_test_evaluate_step + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + test_evaluator = Engine(test_evaluate_step) + + # metrics + train_loss = metrics.Average() + train_loss.attach(trainer,"train_loss") + if cfg.task == "pg": + def rouge_output_transform(output:Tuple[List[str],List[str]]) -> Tuple[List[List[str]],List[List[List[str]]]]: + res,labels = output + res = [item.split() for item in res] + labels = [[item.split()] for item in labels] + return res,labels + test_rouge = metrics.Rouge(variants=['L',1,2],output_transform=rouge_output_transform) + test_rouge.attach(test_evaluator,"test_rouge") + elif cfg.task == "pc": + test_acc = metrics.Accuracy() + test_acc.attach(test_evaluator,"test_acc") + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + + @trainer.on(Events.COMPLETED) + def log_final_results(engine:Engine): + log_data = { "epoch":engine.state.epoch, } + # test evaluation of rouge is too slow, so we only evaluate it at the end + test_evaluator.run(test_dataloader) + test_metrics = test_evaluator.state.metrics + if cfg.task == "pg": + log_data["test_rouge"] = test_metrics["test_rouge"] + elif cfg.task == "pc": + log_data["test_acc"] = test_metrics["test_acc"] + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + # log test evaluation + logger.log_rank(log_data) + if idist.get_rank() == 0: + logger.log_master(log_data) + return log_data + + # @trainer.on(Events.EPOCH_STARTED) # for debug + @trainer.on(Events.EPOCH_COMPLETED) + def log_epoch_results(engine:Engine): + log_data = { "epoch":engine.state.epoch, } + # train evaluation + train_metrics = engine.state.metrics + log_data["train_loss"] = train_metrics["train_loss"] + + model.eval() + if cfg.task == "pg": + # qualitative study first + qualitative_num = cfg.trainer.qualitative_num + qualitative_log_data = {} + qualitative_rouge = 
metrics.Rouge() # calculate rouge for qualitative study + with torch.no_grad(): + for batch in test_dataloader: + prompts = batch.pop("prompts") + labels = batch.pop("labels") + batch.to(idist.device()) + if idist.get_world_size() > 1: + res = model.module.generate(batch) + else: + res = model.generate(batch) + res = res[:qualitative_num] + labels = labels[:qualitative_num] + # calculcate rouge for qulitative study examples + qualitative_rouge.update(([item.split() for item in res],[[item.split()] for item in labels])) + qualitative_log_data["qualitative_rouge"] = qualitative_rouge.compute() + # log qualitative study examples + for idx,(prompt,model_res,label) in enumerate(zip(prompts,res,labels)): + qualitative_log_data[f"qualitative_{idx}"] = { + "prompt":prompt, + "model_res":model_res, + "label":label, + } + break + # log qualitative study + if idist.get_rank() == 0: + logger.log_master(qualitative_log_data,if_wandb=False) + elif cfg.task == "pc": + # test accuracy + test_evaluator.run(test_dataloader) + acc_metrics = test_evaluator.state.metrics + log_data['test_acc'] = acc_metrics['test_acc'] + + # log train evaluation + logger.log_rank(log_data) + if idist.get_rank() == 0: + logger.log_master(log_data) + return log_data + + @trainer.on(Events.ITERATION_COMPLETED(every=cfg.trainer.log_interval)) + def log_step_results(engine:Engine): + log_data = { + "iteration":engine.state.iteration, + "loss_per_step":engine.state.output, + "lr":optimizer.param_groups[0]["lr"], + } + logger.log_rank(log_data) + if idist.get_rank() == 0: + logger.log_master(log_data) + + if cfg.task=="pg": + train_data_collator = PGDataCollator(cfg.data,"train") + train_dataset = PGDataset(cfg.data,split="train") + elif cfg.task=="pc": + train_data_collator = PCDataCollator(cfg.data) + train_dataset = PCDataset(cfg.data,split="train") + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + if idist.get_rank() == 0: + logger.log_master({ + "train dataset prompt_key":f"{train_dataset.prompt_key}" + },if_wandb=False) + + train_dataloader = idist.auto_dataloader( + train_dataset,batch_size=cfg.trainer.batch, + num_workers=cfg.trainer.num_workers, + pin_memory=cfg.trainer.pin_memory, + collate_fn=train_data_collator, + shuffle=True,drop_last=True, + ) + if cfg.task=="pg": + test_data_collator = PGDataCollator(cfg.data,"test") + test_dataset = PGDataset(cfg.data,split="test") + elif cfg.task=="pc": + test_data_collator = PCDataCollator(cfg.data) + test_dataset = PCDataset(cfg.data,split="validation") + test_dataloader = idist.auto_dataloader( + test_dataset,batch_size=cfg.trainer.batch, + num_workers=cfg.trainer.num_workers, + pin_memory=cfg.trainer.pin_memory, + collate_fn=test_data_collator, + shuffle=False,drop_last=False, + ) + + lrscheduler = build_lrscheduler(optimizer,cfg.trainer,len(train_dataset)//cfg.trainer.batch) + trainer.add_event_handler(Events.ITERATION_STARTED,lrscheduler) + + # checkpointing; distributed is automatically handled + to_save = {"model":model,"optimizer":optimizer,"trainer":trainer} + checkpoint_dir = f"{cfg.trainer.checkpoint_dir}/{cfg.jobname}" + checkpoint = Checkpoint(to_save,checkpoint_dir)#,global_step_from_engine(trainer)) + test_evaluator.add_event_handler(Events.STARTED,checkpoint) + + trainer.run(train_dataloader,max_epochs=cfg.trainer.epochs) + +@hydra.main(version_base=None,config_path="config", config_name="basic") +def main(cfg: DictConfig) -> None: + backend = cfg.distributed.backend + with idist.Parallel(backend=backend) as parallel: + 
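# ignite.distributed picks up the torchrun-launched processes and invokes main_engine once per rank
+ 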
parallel.run(main_engine, cfg) + return + +if __name__ == '__main__': + main() diff --git a/examples/commonsense_qa/model.py b/examples/commonsense_qa/model.py new file mode 100644 index 0000000..6f54ada --- /dev/null +++ b/examples/commonsense_qa/model.py @@ -0,0 +1,66 @@ +import re + +from torch import nn +from transformers import AutoModelForSeq2SeqLM,AutoTokenizer,AutoModelForMultipleChoice + +from typing import Dict,List +from omegaconf import DictConfig +from torch import Tensor + +class PCModel(nn.Module): + """ + Prompted Choice Model + """ + def __init__(self,model_config:DictConfig) -> None: + super(PCModel,self).__init__() + self.model_config = model_config + self.model = AutoModelForMultipleChoice.from_pretrained(model_config.name,trust_remote_code=True) + + def forward(self,data:Dict[str,Tensor]): + res = self.model(**data) + return res + +class PGModel(nn.Module): + """ + Prompted Generation Model + """ + def __init__(self,model_config:DictConfig): + super(PGModel,self).__init__() + self.model_config = model_config + self.model = AutoModelForSeq2SeqLM.from_pretrained(model_config.name,trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(model_config.name,trust_remote_code=True) + self.generator = self.build_generator() + def forward(self,data:Dict[str,Tensor]): + res = self.model(**data) + return res + def generate(self,data:Dict[str,Tensor],**kwargs)->List[str]: + return self.generator(data,**kwargs) + def t5_generator(self,data:Dict[str,Tensor],**kwargs)->List[str]: + res = self.model.generate(**data, + max_length=self.model_config.max_gen_length, **kwargs) + res = self.tokenizer.batch_decode(res.tolist()) + pattern = r"(|)*(.*?)(|\Z|)" + res = [re.search(pattern,txt,re.DOTALL).group(2).strip() for txt in res] + return res + def glm_generator(self,data:Dict[str,Tensor],**kwargs)->List[str]: + res = self.model.generate(**data, + max_new_tokens=self.model_config.max_gen_length, + eos_token_id=self.tokenizer.eop_token_id, + pad_token_id=self.tokenizer.pad_token_id, + **kwargs) + res = self.tokenizer.batch_decode(res.tolist()) + pattern = r"<\|startofpiece\|>(.*?)(\Z|<\|endofpiece\|>)" + res = [re.search(pattern,txt,re.DOTALL).group(1).strip() for txt in res] + return res + + def build_generator(self): + if "glm" in self.model_config.name: + return self.glm_generator + elif "t5" in self.model_config.name: + return self.t5_generator + else: + raise NotImplementedError("Not implemented yet") + + +if __name__ == "__main__": + pass diff --git a/examples/commonsense_qa/requirements.txt b/examples/commonsense_qa/requirements.txt new file mode 100644 index 0000000..38fc6db --- /dev/null +++ b/examples/commonsense_qa/requirements.txt @@ -0,0 +1,13 @@ +transformers +datasets +sentencepiece +pytorch-ignite + +promptsource + +einops + +wandb +loguru +hydra-core +omegaconf \ No newline at end of file diff --git a/examples/commonsense_qa/requirements_torch_cuda.txt b/examples/commonsense_qa/requirements_torch_cuda.txt new file mode 100644 index 0000000..9b19b8e --- /dev/null +++ b/examples/commonsense_qa/requirements_torch_cuda.txt @@ -0,0 +1,4 @@ +-f https://download.pytorch.org/whl/torch_stable.html +torch==1.11.0+cu113 +torchvision==0.12.0+cu113 +torchaudio==0.11.0 \ No newline at end of file diff --git a/examples/commonsense_qa/requirements_torch_rocm.txt b/examples/commonsense_qa/requirements_torch_rocm.txt new file mode 100644 index 0000000..d2b7cc2 --- /dev/null +++ b/examples/commonsense_qa/requirements_torch_rocm.txt @@ -0,0 +1,6 @@ +# actually using amlt-sing 
rocm5.1.3, with torch 1.11.0 already installed
+
+# -f https://download.pytorch.org/whl/rocm4.5.2
+# torch==1.11.0+rocm4.5.2
+# torchvision==0.12.0+rocm4.5.2
+# torchaudio==0.11.0
\ No newline at end of file
diff --git a/examples/multi_news/README.md b/examples/multi_news/README.md
new file mode 100644
index 0000000..d1d1e21
--- /dev/null
+++ b/examples/multi_news/README.md
@@ -0,0 +1,89 @@
+# multi_news
+
+Author: Jingcheng Hu, hujc22@mails.tsinghua.edu.cn, https://reign12.github.io/
+
+Student ID: 2022312848
+
+## Task Description
+### Dataset Statistics
+We follow the train-validation-test split from [Huggingface multi_news](https://huggingface.co/datasets/multi_news):
+training (80%, 44,972), validation (10%, 5,622), and test (10%, 5,622).
+Note that the summaries in multi_news are notably longer than in other summarization datasets, about 260 words on average (the source documents are of course even longer). This poses a hard challenge for GLM-RoBERTa-Large and other models with a maximum length of 512: it is very difficult to cover all points of the source documents while generating a sufficiently long summary.
+
+### Task Introduction
+For the task prompts, we use prompt templates from [promptsource](https://github.com/bigscience-workshop/promptsource).
+multi_news is a multi-document summarization task. Here is an example:
+```python
+{
+    "document": "some line val \n another line",
+    "summary": "target val line"
+}
+```
+
+## How to Train and Eval
+### Dependency
+You can activate your own conda env and then run:
+```bash
+bash env_setup.sh cuda # if you are running on NVIDIA GPUs
+
+bash env_setup.sh rocm # if you are running on AMD GPUs
+```
+### Training and Evaluation
+Run `python main.py --help` or go directly to `./config/basic.yaml` to see all supported configurations.
+
+To run distributed training (evaluation is performed along the way; per-step loss, per-epoch loss and the final Rouge scores are recorded):
+```bash
+torchrun --nproc_per_node=<num_gpus> main.py \
+    task="pg" \ # this is a Prompted Generation task
+    data.dataset="multi_news" \
+    model.name="BAAI/glm-roberta-large" \ # we also support t5-large
+    data.prompt_id="0" \ # prompt_id among the original_task=True prompt templates from promptsource; the name of each prompt is printed in the training log when the job starts, e.g. "train dataset prompt_key ['distill', 'summarize', 'summary scenario', 'synthesize', 'what are the key points']"
+    jobname=<your_jobname> \
+    debug=False \ # set debug=True to disable wandb; wandb-related variables can be set as env vars or typed in when the program asks for them; see logger.py for details
+    optimizer.lr="5e-5" \ # no lr scaling is applied; this lr is the final lr
+    trainer.batch="8" \ # total batch size summed over all cards
+    trainer.accumulate_steps="4" \ # gradient accumulation steps for a larger effective batch size
+    trainer.warmup_start="1e-8" \ # warmup lr at the start
+    trainer.epochs="3" trainer.warmup_epochs="1" # we use linear warmup and cosine decay
+    # more configs can be changed; see ./config/basic.yaml for details and simply follow the pattern here
+```
+
+## Results
+We use Rouge-1-{P,R} as the metrics.
+The final-epoch Rouge-1-{P,R} of the above command are 45.19/22.95 on the test set.
+For T5-Large we tuned the learning rate; the best scores are 46.15/20.56.
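+For reference, these Rouge scores are computed with `ignite.metrics.Rouge` on whitespace-tokenized strings, as in `rouge_output_transform` in `main.py`. Below is a minimal sketch (assuming pytorch-ignite is installed; the two example strings are made up):
+```python
+from ignite.metrics import Rouge
+
+metric = Rouge(variants=["L", 1, 2])
+hyp = "officials announced a new climate agreement on friday"
+ref = "a new climate agreement was announced by officials on friday"
+# candidates are token lists; references are lists of token lists (one hypothesis may have several references)
+metric.update(([hyp.split()], [[ref.split()]]))
+print(metric.compute())  # the returned dict includes Rouge-1-P and Rouge-1-R among other scores
+```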
+
+| Model | Rouge-1-P | Rouge-1-R |
+|:---:|:---:|:---:|
+| glm-roberta-large | 45.19 | 22.95 |
+| t5-large | 46.15 | 20.56 |
+
+## Reference
+The multi_news dataset paper:
+```bibtex
+@misc{fabbriMultiNewsLargeScaleMultiDocument2019,
+  title = {Multi-{{News}}: A {{Large-Scale Multi-Document Summarization Dataset}} and {{Abstractive Hierarchical Model}}},
+  shorttitle = {Multi-{{News}}},
+  author = {Fabbri, Alexander R. and Li, Irene and She, Tianwei and Li, Suyi and Radev, Dragomir R.},
+  date = {2019-06-19},
+  number = {arXiv:1906.01749},
+  eprint = {1906.01749},
+  eprinttype = {arxiv},
+  primaryclass = {cs},
+  publisher = {{arXiv}},
+  doi = {10.48550/arXiv.1906.01749},
+  url = {http://arxiv.org/abs/1906.01749},
+  urldate = {2023-01-02},
+  archiveprefix = {arXiv}
+}
+
+```
+For T5 we follow the Hugging Face implementation; the original paper is:
+```bibtex
+@article{raffelExploringLimitsTransfer,
+  title = {Exploring the {{Limits}} of {{Transfer Learning}} with a {{Unified Text-to-Text Transformer}}},
+  author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
+  journaltitle = {Journal of Machine Learning Research},
+  date = {2020},
+  volume = {21},
+  number = {140},
+  pages = {1--67},
+  langid = {english}
+}
+```
\ No newline at end of file
diff --git a/examples/multi_news/config/basic.yaml b/examples/multi_news/config/basic.yaml
new file mode 100644
index 0000000..e090421
--- /dev/null
+++ b/examples/multi_news/config/basic.yaml
@@ -0,0 +1,46 @@
+defaults:
+  - _self_
+  - override hydra/hydra_logging: disabled
+  - override hydra/job_logging: disabled
+
+hydra:
+  output_subdir: null
+  run:
+    dir: .
+
+debug: False
+jobname: test
+task: "pc" # pg:generation, pc:choice
+model:
+  # for pg: BAAI/glm-roberta-large,t5-large
+  # for pc: BAAI/glm-roberta-large, bert-large-uncased, roberta-large
+  name: "BAAI/glm-roberta-large"
+  max_length: 384 # dataset related
+  max_gen_length: 128 # dataset related
+data:
+  dataset: "multi_news" # currently support commonsense_qa, multi_news
+  # dataset: "commonsense_qa" # currently support commonsense_qa, multi_news
+  tokenizer: ${model.name}
+  max_length: ${model.max_length}
+  max_gen_length: ${model.max_gen_length}
+  prompt_id: 0 # id in promptsource original_task=True name_l
+  answer_prompt: "Answer:"
+optimizer:
+  lr: 1e-5
+  beta1: 0.9
+  beta2: 0.999
+  wd: 0.01
+trainer:
+  batch: 64 # batch in total
+  accumulate_steps: 1
+  epochs: 10
+  lrscheduler: cosine
+  warmup_start: 1e-7
+  warmup_epochs: 1
+  num_workers: 1 # num_workers in total
+  pin_memory: True
+  log_interval: 100
+  qualitative_num: 5
+  checkpoint_dir: "./checkpoints" # path for ckps
+distributed:
+  backend: "nccl"
diff --git a/examples/multi_news/data.py b/examples/multi_news/data.py
new file mode 100644
index 0000000..2f4557e
--- /dev/null
+++ b/examples/multi_news/data.py
@@ -0,0 +1,218 @@
+from typing import Dict,Tuple,List
+from torch import Tensor
+
+import torch
+from torch.utils.data import Dataset
+from omegaconf import DictConfig
+
+# Support GLM, BART, T5
+from transformers import AutoTokenizer
+# Support commonsense_qa, multi_news
+from datasets import load_dataset
+
+# prompt support
+from promptsource.templates import DatasetTemplates
+
+from einops import rearrange
+
+class PCDataCollator:
+    def __init__(self,datacollator_config: DictConfig):
+        self.datacollator_config = datacollator_config
+        self.tokenizer = AutoTokenizer.from_pretrained(self.datacollator_config.tokenizer,trust_remote_code=True)
+        self.collator = self.build_collator()
+    def __call__(self, batch:List[Tuple[str,List[str],int]]) -> 
Dict[str,Tensor]: + return self.collator(batch) + def build_collator(self): + if "glm" in self.datacollator_config.tokenizer: + return self.glm_collator + elif "roberta" in self.datacollator_config.tokenizer: + return self.roberta_collator + elif 'bert' in self.datacollator_config.tokenizer: + return self.roberta_collator + else: + raise NotImplementedError("Not implemented yet") + def roberta_collator(self,batch:List[Tuple[str,List[str],int]]) -> Dict[str,Tensor]: + prompt_l = [] + choice_l = [] + choice_ids = [] + for item in batch: + for choice in item[1]: + prompt_l.append(item[0]) + choice_l.append(choice) + choice_ids.append(item[2]) + res = self.tokenizer(prompt_l,choice_l, + return_tensors="pt", + padding=True,truncation=True,max_length=self.datacollator_config.max_length) + for key in res: + res[key] = rearrange(res[key],'(b c) l -> b c l',b=len(batch),c=len(item[1])) + + labels = torch.tensor(choice_ids) + res['labels'] = labels + return res + + def glm_collator(self,batch:List[Tuple[str,List[str],int]]) -> Dict[str,Tensor]: + prompts,choices_l,choice_ids = zip(*batch) + prompts = self.tokenizer(prompts,return_tensors="pt", + padding=True,truncation=True,max_length=self.datacollator_config.max_length + ) + res = self.tokenizer.build_inputs_for_multiple_choice(prompts,choices_l) + labels = torch.tensor(choice_ids) + res['labels'] = labels + return res + +class PCDataset(Dataset): + def __init__(self,dataset_config: DictConfig,split:str): + self.dataset_config = dataset_config + self.dataset = load_dataset(*dataset_config.dataset.split("/"),split=split) + self.prompt_key,self.prompter = self.build_prompter() + self.adapter = self.build_adapter() + + def __len__(self): + return len(self.dataset) + def __getitem__(self, index: int) -> Tuple[str,List[str],int]: + data = self.dataset[index] + prompt,choice = self.prompter.apply(data) + choices_l = self.prompter.get_answer_choices_list(data) + choice_id = choices_l.index(choice) + prompt = prompt + "\n\n" + self.dataset_config.answer_prompt + res = self.adapter(prompt,choices_l,choice_id) + return res + def build_adapter(self): + if "glm" in self.dataset_config.tokenizer: + return self.glm_adapter + elif "roberta" in self.dataset_config.tokenizer: + return self.roberta_adapter + elif 'bert' in self.dataset_config.tokenizer: + return self.roberta_adapter + else: + raise NotImplementedError("Not implemented yet") + def roberta_adapter(self,prompt:str,choices_l:List[str],choice_id:int) -> Tuple[str,List[str],int]: + return prompt,choices_l,choice_id + def glm_adapter(self,prompt:str,choices_l:List[str],choice_id:int) -> Tuple[str,List[str],int]: + prompt += "[MASK]" + return prompt,choices_l,choice_id + def build_prompter(self): + all_prompts = DatasetTemplates(self.dataset_config.dataset) + # filter out those not original_task + prompt_key = [name for name in all_prompts.all_template_names if all_prompts[name].metadata.original_task ] + prompter = all_prompts[prompt_key[self.dataset_config.prompt_id]] + return prompt_key,prompter + +class PGDataCollator: + def __init__(self,datacollator_config: DictConfig,split:str): + self.datacollator_config = datacollator_config + self.split = split + self.tokenizer = AutoTokenizer.from_pretrained(self.datacollator_config.tokenizer,trust_remote_code=True) + self.collator = self.build_collator() + def build_collator(self): + if "glm" in self.datacollator_config.tokenizer: + if self.split == "train": + return self.glm_train_collator + else: + return self.glm_test_collator + elif "t5" in 
self.datacollator_config.tokenizer: + if self.split == "train": + return self.t5_train_collator + else: + return self.t5_test_collator + else: + raise NotImplementedError("Not implemented yet") + def t5_train_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for item in zip(*batch)] + self.tokenizer.truncation_side = 'left' + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res['labels'] = self.tokenizer(answers,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt")['input_ids'] + return res + def t5_test_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for item in zip(*batch)] + self.tokenizer.truncation_side = 'left' + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res['labels'] = [answer[len(" "):] for answer in answers] # rm the prepended + res['prompts'] = prompts + return res + def glm_train_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for item in zip(*batch)] + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res = self.tokenizer.build_inputs_for_generation(res,targets=answers,max_gen_length=self.datacollator_config.max_gen_length) + return res + def glm_test_collator(self,batch: List[Tuple[str,str]]) -> Dict[str,Tensor]: + prompts,answers = [list(item) for item in zip(*batch)] + res = self.tokenizer(prompts,padding=True,truncation=True,max_length=self.datacollator_config.max_length,return_tensors="pt") + res = self.tokenizer.build_inputs_for_generation(res,max_gen_length=self.datacollator_config.max_gen_length) + res['labels'] = answers + res['prompts'] = prompts + return res + + def __call__(self, batch: List[Tuple[str,str]]) -> Dict[str, Tensor]: + return self.collator(batch) + +class PGDataset(Dataset): + def __init__(self,dataset_config:DictConfig,split:str): + """ + split = "train" or "validation" or "test" + """ + self.dataset_config = dataset_config + self.max_length = dataset_config.max_length + self.max_gen_length = dataset_config.max_gen_length + + self.dataset = load_dataset(*dataset_config.dataset.split("/"),split=split) + self.prompt_key,self.prompter = self.build_prompter() + self.answer_prompt = dataset_config.answer_prompt + self.adapter = self.build_adapter() + + + def build_adapter(self): + adapter_name = self.dataset_config.tokenizer + if "glm" in adapter_name: + adapter = self.glm_adapter + elif "t5" in adapter_name: + adapter = self.t5_adapter + elif "bart" in adapter_name: + adapter = self.bart_adapter + else: + raise NotImplementedError(f"Adapter {adapter_name} is not supported") + return adapter + + def glm_adapter(self,prompted_data:Tuple[str,str])->Tuple[str,str]: + prompt,answer = prompted_data + # add mask token + prompt += "[MASK]" + res = prompt,answer + return res + + def t5_adapter(self,prompted_data): + prompt,answer = prompted_data + # add sentinel token for prompt and answer + prompt = f'{prompt} ' + answer = f' {answer}' + return prompt,answer + return res + + def build_prompter(self): + all_prompts = DatasetTemplates(self.dataset_config.dataset) + # filter out those not original_task + prompt_key = [name for name in all_prompts.all_template_names if all_prompts[name].metadata.original_task ] + prompter = 
all_prompts[prompt_key[self.dataset_config.prompt_id]] + return prompt_key,prompter + + def __len__(self)->int: + return len(self.dataset) + def __getitem__(self, index:int)->Tuple[str,str]: + # TODO: format the data using prompt, add mask token based on model, padding based on max_lenght, then pass the tokenizer + data = self.dataset[index] + prompted_data = self.prompter.apply(data) + prompted_data[0] = prompted_data[0] + "\n\n" + self.answer_prompt + res = self.adapter(prompted_data) + return res + + + +if __name__ == "__main__": + dataset = load_dataset("commonsense_qa") + print(dataset.keys()) + dataset = load_dataset("multi_news") + print(dataset.keys()) + + # multi_news_prompts = DatasetTemplates("multi_news") + # print(multi_news_prompts.all_template_names) + diff --git a/examples/multi_news/env_setup.sh b/examples/multi_news/env_setup.sh new file mode 100644 index 0000000..d77a0ff --- /dev/null +++ b/examples/multi_news/env_setup.sh @@ -0,0 +1,13 @@ +# Support cuda and rocm environment +# for cuda we test on V100 and A100 +# for rocm we test on MI100, the image tested with building with pytorch-1.11.0-rocm5.1.3 + +GPU_ENV="$1" +if [ $GPU_ENV = "cuda" ] || [ $GPU_ENV = "rocm" ]; then + echo "Installing $GPU_ENV environment" + pip install -r "requirements_torch_${GPU_ENV}.txt" + pip install -r requirements.txt + pip install numpy tqdm -U +else + echo "Unsupported environment $GPU_ENV" +fi \ No newline at end of file diff --git a/examples/multi_news/logger.py b/examples/multi_news/logger.py new file mode 100644 index 0000000..659c3c6 --- /dev/null +++ b/examples/multi_news/logger.py @@ -0,0 +1,49 @@ +import os +import sys +from typing import Dict +from omegaconf import OmegaConf,DictConfig +import wandb +import loguru + +class Logger: + def __init__(self, cfg:DictConfig): + self.cfg = cfg + self.logger = loguru.logger + self.logger.remove() + self.logger.add(f"{self.cfg.jobname}.log") + + def log_rank(self,data:Dict): + log_str = "\t".join(f"{key} {data[key]}" for key in data) + self.logger.info(log_str) + + def log_master(self,data:Dict,if_wandb:bool=True): + """ + only called in master process + """ + log_str = "\t".join(f"{key} {data[key]}" for key in data) + self.logger.info(log_str) + if self.cfg.debug != True and if_wandb: + wandb.log(data) + + def login(self): + """ + only login once in distributed training + """ + self.logger.add(sys.stderr) + # if you do not want to use wandb, you can set self.cfg.debug = True + if self.cfg.debug != True: + # you can use export VAR=VALUE to set environment variables before running the script + wandb_var_l = ["WANDB_API_KEY","WANDB_ENTITY","WANDB_PROJECT"] + for wandb_var in wandb_var_l: + if os.environ.get(wandb_var) is None: + os.environ[wandb_var] = input(f"Please input your {wandb_var}:") + wandb.login(key=os.environ['WANDB_API_KEY']) + wandb.init(project=os.environ['WANDB_PROJECT'], entity=os.environ['WANDB_ENTITY'], + name=self.cfg.jobname,config=OmegaConf.to_container(self.cfg,resolve=True)) + # wandb.config.update(self.cfg) + + self.logger.info("\nConfigs are:\n" + OmegaConf.to_yaml(self.cfg)) + +if __name__ == "__main__": + mylogger = Logger() + mylogger.log({"loss":0.1,"acc":0.9}) diff --git a/examples/multi_news/lrscheduler.py b/examples/multi_news/lrscheduler.py new file mode 100644 index 0000000..d41ba00 --- /dev/null +++ b/examples/multi_news/lrscheduler.py @@ -0,0 +1,17 @@ +from omegaconf import DictConfig +from torch.optim.lr_scheduler import CosineAnnealingLR +from ignite.handlers import create_lr_scheduler_with_warmup + +def 
build_lrscheduler(optimizer, trainer_cfg:DictConfig,steps_per_epoch:int): + if trainer_cfg.lrscheduler == "cosine": + lrscheduler = CosineAnnealingLR( + optimizer, T_max=(trainer_cfg.epochs-trainer_cfg.warmup_epochs)*steps_per_epoch, + eta_min=trainer_cfg.warmup_start # set annealing end lr to warmup start lr + ) + lrscheduler = create_lr_scheduler_with_warmup( + lrscheduler,warmup_start_value=trainer_cfg.warmup_start, + warmup_duration=trainer_cfg.warmup_epochs*steps_per_epoch + ) + else: + raise NotImplementedError + return lrscheduler diff --git a/examples/multi_news/main.py b/examples/multi_news/main.py new file mode 100644 index 0000000..fec5f98 --- /dev/null +++ b/examples/multi_news/main.py @@ -0,0 +1,237 @@ +import warnings +import hydra +from omegaconf import DictConfig + +from logger import Logger +from data import PGDataset,PGDataCollator,PCDataset,PCDataCollator +from model import PGModel, PCModel + +import torch +from torch import optim + +from typing import Tuple,List +from torch import Tensor + +import ignite.distributed as idist +from ignite.engine import Engine,Events +from ignite import metrics +from lrscheduler import build_lrscheduler +from ignite.handlers import Checkpoint, global_step_from_engine + +def main_engine(local_rank: int, cfg: DictConfig,**kwargs): + # ignore warnings + if not cfg.debug: + warnings.filterwarnings("ignore") + # Setup logger + logger = Logger(cfg) + if idist.get_rank() == 0: + logger.login() + logger.log_rank({"rank":idist.get_rank(),"local_rank":local_rank,"world_size":idist.get_world_size()}) + + # Setup model + if cfg.task == "pg": + model = PGModel(cfg.model) + elif cfg.task == "pc": + model = PCModel(cfg.model) + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + model = idist.auto_model(model) + # Setup optimizer + optimizer = optim.AdamW(model.parameters(), + lr=cfg.optimizer.lr,betas=(cfg.optimizer.beta1,cfg.optimizer.beta2), + weight_decay=cfg.optimizer.wd, + ) + optimizer = idist.auto_optim(optimizer) + + def train_step(engine:Engine, batch:Tensor) -> Tensor: + model.train() + batch.to(idist.device()) + loss = model(batch).loss + loss = loss / cfg.trainer.accumulate_steps # accumulate gradients + loss.backward() + if engine.state.iteration % cfg.trainer.accumulate_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + + def pg_test_evaluate_step(engine:Engine, batch:Tensor) -> Tuple[List[str],List[str]]: + model.eval() + prompts = batch.pop("prompts") + labels = batch.pop("labels") + with torch.no_grad(): + batch.to(idist.device()) + if idist.get_world_size() > 1: + res = model.module.generate(batch) + else: + res = model.generate(batch) + return res, labels + + def pc_test_evaluate_step(engine:Engine, batch:Tensor) -> Tuple[List[str],List[str]]: + model.eval() + with torch.no_grad(): + batch.to(idist.device()) + labels = batch.pop("labels") + res = model(batch).logits + return res, labels + + # setup engines + trainer = Engine(train_step) + if cfg.task == "pg": + test_evaluate_step = pg_test_evaluate_step + elif cfg.task == "pc": + test_evaluate_step = pc_test_evaluate_step + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + test_evaluator = Engine(test_evaluate_step) + + # metrics + train_loss = metrics.Average() + train_loss.attach(trainer,"train_loss") + if cfg.task == "pg": + def rouge_output_transform(output:Tuple[List[str],List[str]]) -> Tuple[List[List[str]],List[List[List[str]]]]: + res,labels = output + res = [item.split() for item in res] + labels = 
[[item.split()] for item in labels] + return res,labels + test_rouge = metrics.Rouge(variants=['L',1,2],output_transform=rouge_output_transform) + test_rouge.attach(test_evaluator,"test_rouge") + elif cfg.task == "pc": + test_acc = metrics.Accuracy() + test_acc.attach(test_evaluator,"test_acc") + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + + @trainer.on(Events.COMPLETED) + def log_final_results(engine:Engine): + log_data = { "epoch":engine.state.epoch, } + # test evaluation of rouge is too slow, so we only evaluate it at the end + test_evaluator.run(test_dataloader) + test_metrics = test_evaluator.state.metrics + if cfg.task == "pg": + log_data["test_rouge"] = test_metrics["test_rouge"] + elif cfg.task == "pc": + log_data["test_acc"] = test_metrics["test_acc"] + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + # log test evaluation + logger.log_rank(log_data) + if idist.get_rank() == 0: + logger.log_master(log_data) + return log_data + + # @trainer.on(Events.EPOCH_STARTED) # for debug + @trainer.on(Events.EPOCH_COMPLETED) + def log_epoch_results(engine:Engine): + log_data = { "epoch":engine.state.epoch, } + # train evaluation + train_metrics = engine.state.metrics + log_data["train_loss"] = train_metrics["train_loss"] + + model.eval() + if cfg.task == "pg": + # qualitative study first + qualitative_num = cfg.trainer.qualitative_num + qualitative_log_data = {} + qualitative_rouge = metrics.Rouge() # calculate rouge for qualitative study + with torch.no_grad(): + for batch in test_dataloader: + prompts = batch.pop("prompts") + labels = batch.pop("labels") + batch.to(idist.device()) + if idist.get_world_size() > 1: + res = model.module.generate(batch) + else: + res = model.generate(batch) + res = res[:qualitative_num] + labels = labels[:qualitative_num] + # calculcate rouge for qulitative study examples + qualitative_rouge.update(([item.split() for item in res],[[item.split()] for item in labels])) + qualitative_log_data["qualitative_rouge"] = qualitative_rouge.compute() + # log qualitative study examples + for idx,(prompt,model_res,label) in enumerate(zip(prompts,res,labels)): + qualitative_log_data[f"qualitative_{idx}"] = { + "prompt":prompt, + "model_res":model_res, + "label":label, + } + break + # log qualitative study + if idist.get_rank() == 0: + logger.log_master(qualitative_log_data,if_wandb=False) + elif cfg.task == "pc": + # test accuracy + test_evaluator.run(test_dataloader) + acc_metrics = test_evaluator.state.metrics + log_data['test_acc'] = acc_metrics['test_acc'] + + # log train evaluation + logger.log_rank(log_data) + if idist.get_rank() == 0: + logger.log_master(log_data) + return log_data + + @trainer.on(Events.ITERATION_COMPLETED(every=cfg.trainer.log_interval)) + def log_step_results(engine:Engine): + log_data = { + "iteration":engine.state.iteration, + "loss_per_step":engine.state.output, + "lr":optimizer.param_groups[0]["lr"], + } + logger.log_rank(log_data) + if idist.get_rank() == 0: + logger.log_master(log_data) + + if cfg.task=="pg": + train_data_collator = PGDataCollator(cfg.data,"train") + train_dataset = PGDataset(cfg.data,split="train") + elif cfg.task=="pc": + train_data_collator = PCDataCollator(cfg.data) + train_dataset = PCDataset(cfg.data,split="train") + else: + raise NotImplementedError(f"Task {cfg.task} is not supported") + if idist.get_rank() == 0: + logger.log_master({ + "train dataset prompt_key":f"{train_dataset.prompt_key}" + },if_wandb=False) + + train_dataloader = idist.auto_dataloader( + 
train_dataset,batch_size=cfg.trainer.batch, + num_workers=cfg.trainer.num_workers, + pin_memory=cfg.trainer.pin_memory, + collate_fn=train_data_collator, + shuffle=True,drop_last=True, + ) + if cfg.task=="pg": + test_data_collator = PGDataCollator(cfg.data,"test") + test_dataset = PGDataset(cfg.data,split="test") + elif cfg.task=="pc": + test_data_collator = PCDataCollator(cfg.data) + test_dataset = PCDataset(cfg.data,split="validation") + test_dataloader = idist.auto_dataloader( + test_dataset,batch_size=cfg.trainer.batch, + num_workers=cfg.trainer.num_workers, + pin_memory=cfg.trainer.pin_memory, + collate_fn=test_data_collator, + shuffle=False,drop_last=False, + ) + + lrscheduler = build_lrscheduler(optimizer,cfg.trainer,len(train_dataset)//cfg.trainer.batch) + trainer.add_event_handler(Events.ITERATION_STARTED,lrscheduler) + + # checkpointing; distributed is automatically handled + to_save = {"model":model,"optimizer":optimizer,"trainer":trainer} + checkpoint_dir = f"{cfg.trainer.checkpoint_dir}/{cfg.jobname}" + checkpoint = Checkpoint(to_save,checkpoint_dir)#,global_step_from_engine(trainer)) + test_evaluator.add_event_handler(Events.STARTED,checkpoint) + + trainer.run(train_dataloader,max_epochs=cfg.trainer.epochs) + +@hydra.main(version_base=None,config_path="config", config_name="basic") +def main(cfg: DictConfig) -> None: + backend = cfg.distributed.backend + with idist.Parallel(backend=backend) as parallel: + parallel.run(main_engine, cfg) + return + +if __name__ == '__main__': + main() diff --git a/examples/multi_news/model.py b/examples/multi_news/model.py new file mode 100644 index 0000000..6f54ada --- /dev/null +++ b/examples/multi_news/model.py @@ -0,0 +1,66 @@ +import re + +from torch import nn +from transformers import AutoModelForSeq2SeqLM,AutoTokenizer,AutoModelForMultipleChoice + +from typing import Dict,List +from omegaconf import DictConfig +from torch import Tensor + +class PCModel(nn.Module): + """ + Prompted Choice Model + """ + def __init__(self,model_config:DictConfig) -> None: + super(PCModel,self).__init__() + self.model_config = model_config + self.model = AutoModelForMultipleChoice.from_pretrained(model_config.name,trust_remote_code=True) + + def forward(self,data:Dict[str,Tensor]): + res = self.model(**data) + return res + +class PGModel(nn.Module): + """ + Prompted Generation Model + """ + def __init__(self,model_config:DictConfig): + super(PGModel,self).__init__() + self.model_config = model_config + self.model = AutoModelForSeq2SeqLM.from_pretrained(model_config.name,trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(model_config.name,trust_remote_code=True) + self.generator = self.build_generator() + def forward(self,data:Dict[str,Tensor]): + res = self.model(**data) + return res + def generate(self,data:Dict[str,Tensor],**kwargs)->List[str]: + return self.generator(data,**kwargs) + def t5_generator(self,data:Dict[str,Tensor],**kwargs)->List[str]: + res = self.model.generate(**data, + max_length=self.model_config.max_gen_length, **kwargs) + res = self.tokenizer.batch_decode(res.tolist()) + pattern = r"(|)*(.*?)(|\Z|)" + res = [re.search(pattern,txt,re.DOTALL).group(2).strip() for txt in res] + return res + def glm_generator(self,data:Dict[str,Tensor],**kwargs)->List[str]: + res = self.model.generate(**data, + max_new_tokens=self.model_config.max_gen_length, + eos_token_id=self.tokenizer.eop_token_id, + pad_token_id=self.tokenizer.pad_token_id, + **kwargs) + res = self.tokenizer.batch_decode(res.tolist()) + pattern = 
r"<\|startofpiece\|>(.*?)(\Z|<\|endofpiece\|>)" + res = [re.search(pattern,txt,re.DOTALL).group(1).strip() for txt in res] + return res + + def build_generator(self): + if "glm" in self.model_config.name: + return self.glm_generator + elif "t5" in self.model_config.name: + return self.t5_generator + else: + raise NotImplementedError("Not implemented yet") + + +if __name__ == "__main__": + pass diff --git a/examples/multi_news/requirements.txt b/examples/multi_news/requirements.txt new file mode 100644 index 0000000..38fc6db --- /dev/null +++ b/examples/multi_news/requirements.txt @@ -0,0 +1,13 @@ +transformers +datasets +sentencepiece +pytorch-ignite + +promptsource + +einops + +wandb +loguru +hydra-core +omegaconf \ No newline at end of file diff --git a/examples/multi_news/requirements_torch_cuda.txt b/examples/multi_news/requirements_torch_cuda.txt new file mode 100644 index 0000000..9b19b8e --- /dev/null +++ b/examples/multi_news/requirements_torch_cuda.txt @@ -0,0 +1,4 @@ +-f https://download.pytorch.org/whl/torch_stable.html +torch==1.11.0+cu113 +torchvision==0.12.0+cu113 +torchaudio==0.11.0 \ No newline at end of file diff --git a/examples/multi_news/requirements_torch_rocm.txt b/examples/multi_news/requirements_torch_rocm.txt new file mode 100644 index 0000000..d2b7cc2 --- /dev/null +++ b/examples/multi_news/requirements_torch_rocm.txt @@ -0,0 +1,6 @@ +# actually using amlt-sing rocm5.1.3, with torch1.11.0 installed already + +# -f https://download.pytorch.org/whl/rocm4.5.2 +# torch==1.11.0+rocm4.5.2 +# torchvision==0.12.0+rocm4.5.2 +# torchaudio==0.11.0 \ No newline at end of file