Skip to content

Commit 7ef209b

Browse files
authored
Merge pull request #584 from sillsdev/issue365
Solve Issue #365 and #390
2 parents 31cb9b7 + 6daa28e commit 7ef209b

File tree

2 files changed

+4
-3
lines changed

2 files changed

+4
-3
lines changed

silnlp/nmt/config.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -493,7 +493,8 @@ def preprocess(self, stats: bool, force_align: bool = False) -> None:
493493
LOGGER.error(f"The source file {str(file)} does not exist.")
494494
return
495495

496-
self._build_vocabs(stats)
496+
if self.data["tokenize"]:
497+
self._build_vocabs(stats)
497498
tokenizer = self.create_tokenizer()
498499
self._build_corpora(tokenizer, stats, force_align)
499500
LOGGER.info("Preprocessing completed")
@@ -557,7 +558,7 @@ def _build_corpora(self, tokenizer: Tokenizer, stats: bool, force_align: bool) -
557558
dict_count = self._write_dictionary(tokenizer, src_terms_files, trg_terms_files)
558559
LOGGER.info(f"dictionary size: {dict_count}")
559560

560-
if stats:
561+
if stats and self.data["tokenize"]:
561562
self._calculate_tokenization_stats()
562563

563564
return train_count

silnlp/nmt/hugging_face_config.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -540,7 +540,7 @@ def _build_vocabs(self, stats: bool = False) -> None:
540540
["Target", 0],
541541
]
542542

543-
if stats:
543+
if stats and self.data["tokenize"]:
544544
stats_columns = pd.MultiIndex.from_tuples(
545545
[
546546
(" ", "Translation Side"),

0 commit comments

Comments
 (0)