From 07241edd9bcf3904d26632912d604721750513e1 Mon Sep 17 00:00:00 2001
From: Naufal Suryanto <51362638+naufalso@users.noreply.github.com>
Date: Wed, 26 Apr 2023 10:16:11 +0900
Subject: [PATCH] Fix custom token checks in train.py

After fine-tuning the LLaMA model with the existing training code, I
realized that the model never outputs the EOS token, so generation does
not stop until max_new_tokens is reached.

While debugging, I found that `tokenizer.eos_token`, `tokenizer.bos_token`,
and `tokenizer.unk_token` are all `''` (empty string). Since `''` is not
equal to `None`, the custom tokens in the training code are never added.
So I would suggest the fix in this patch. I have verified that, after
training with the modified code, the model outputs the EOS token correctly.

---
 train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/train.py b/train.py
index 2b5a98bd..6e5d20d0 100644
--- a/train.py
+++ b/train.py
@@ -196,13 +196,13 @@ def train():
         use_fast=False,
     )
     special_tokens_dict = dict()
-    if tokenizer.pad_token is None:
+    if not tokenizer.pad_token:
         special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
-    if tokenizer.eos_token is None:
+    if not tokenizer.eos_token:
         special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
-    if tokenizer.bos_token is None:
+    if not tokenizer.bos_token:
         special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
-    if tokenizer.unk_token is None:
+    if not tokenizer.unk_token:
         special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
 
     smart_tokenizer_and_embedding_resize(
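
For reference, a minimal sketch of the truthiness behavior the fix relies
on. It does not load a model or tokenizer; the '' value simply mirrors
what `tokenizer.eos_token` returned while debugging:

    # The special-token attributes came back as '' (empty string), not None.
    eos_token = ""

    if eos_token is None:       # old check: False for '', so it never fired
        print("would add EOS")  # never reached
    if not eos_token:           # new check: True for both None and ''
        print("adds EOS")       # reached; the default EOS token gets added

Using `not` treats None and '' the same way, which matches the intent of
"add a default special token when none is set".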