generated from HephaestusProject/template
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbuild_vocabulary.py
40 lines (30 loc) · 1.2 KB
/
build_vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""
Usage:
main.py build-vocabulary [options]
main.py build-vocabulary (-h | --help)
Options:
--data-path <data-path> [type: path]
--word-vocabulary-path <word-vocabulary-path> [type: path]
--char-vocabulary-path <char-vocabulary-path> [type: path]
-h --help Show this.
"""
from pathlib import Path
from dataset import CHAR_SPECIAL_TOKENS, WORD_SPECIAL_TOKENS, CharCorpusDataset
from tokenizers.char_tokenizer import CharTokenizer
from tokenizers.word_tokenizer import WordTokenizer
def build_vocabulary(args: dict):
    """Build the word-level and character-level vocabularies and save them.

    Both tokenizers are built from the corpus file given by
    ``args["--data-path"]``; each one is then saved to its own path.

    Args:
        args: Parsed command-line options. The keys read here are
            ``"--data-path"``, ``"--word-vocabulary-path"`` and
            ``"--char-vocabulary-path"``.
    """
    corpus_path = args["--data-path"]

    # ``generate_sentences`` returns a single-pass generator, so each
    # tokenizer is handed a fresh one instead of sharing a stream.
    word_sentences = generate_sentences(corpus_path)
    word_level = WordTokenizer.build_from_generator(
        sentences=word_sentences,
        special_tokens=WORD_SPECIAL_TOKENS,
    )

    char_sentences = generate_sentences(corpus_path)
    char_level = CharTokenizer.build_from_generator(
        sentences=char_sentences,
        special_tokens=CHAR_SPECIAL_TOKENS,
    )

    # Save only after both tokenizers were built successfully.
    word_level.save(vocabulary_path=args["--word-vocabulary-path"])
    char_level.save(vocabulary_path=args["--char-vocabulary-path"])
def generate_sentences(data_path: Path):
    """Lazily yield the normalized lines of a corpus file.

    The file is opened in text mode and read one line at a time; every line
    is passed through ``CharCorpusDataset.normalize_line`` before being
    yielded. The file stays open until the generator is exhausted or closed.

    Args:
        data_path: Path of the corpus file to read.

    Yields:
        The value returned by ``CharCorpusDataset.normalize_line`` for each
        line of the file, in file order.
    """
    # NOTE(review): no encoding is passed, so the platform default applies --
    # confirm this matches how the corpus is read elsewhere in the project.
    with data_path.open() as corpus_file:
        for raw_line in corpus_file:
            yield CharCorpusDataset.normalize_line(raw_line)