From 07241edd9bcf3904d26632912d604721750513e1 Mon Sep 17 00:00:00 2001
From: Naufal Suryanto <51362638+naufalso@users.noreply.github.com>
Date: Wed, 26 Apr 2023 10:16:11 +0900
Subject: [PATCH] Fix custom token checks in train.py

After fine-tuning the LLaMA model with the existing training code, I
realized that the model never outputs the EOS token, so generation does
not stop until max_new_tokens is reached.

While debugging, I found that `tokenizer.eos_token`, `tokenizer.bos_token`,
and `tokenizer.unk_token` are all `''` (empty string). Since `''` is not
equal to `None`, the custom tokens in the training code are never added.
So I would suggest the fix in this patch. I have verified that, after
training with the modified code, the model outputs the EOS token correctly.

---
 train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/train.py b/train.py
index 2b5a98bd..6e5d20d0 100644
--- a/train.py
+++ b/train.py
@@ -196,13 +196,13 @@ def train():
         use_fast=False,
     )
     special_tokens_dict = dict()
-    if tokenizer.pad_token is None:
+    if not tokenizer.pad_token:
         special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
-    if tokenizer.eos_token is None:
+    if not tokenizer.eos_token:
         special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
-    if tokenizer.bos_token is None:
+    if not tokenizer.bos_token:
         special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
-    if tokenizer.unk_token is None:
+    if not tokenizer.unk_token:
         special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
 
     smart_tokenizer_and_embedding_resize(
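
For reference, a minimal sketch of the truthiness behavior the fix relies
on. It does not load a model or tokenizer; the '' value simply mirrors
what `tokenizer.eos_token` returned while debugging:

    # The special-token attributes came back as '' (empty string), not None.
    eos_token = ""

    if eos_token is None:       # old check: False for '', so it never fired
        print("would add EOS")  # never reached
    if not eos_token:           # new check: True for both None and ''
        print("adds EOS")       # reached; the default EOS token gets added

Using `not` treats None and '' the same way, which matches the intent of
"add a default special token when none is set".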