Commit
Showing 9 changed files with 975 additions and 0 deletions.
@@ -0,0 +1,16 @@
# pre-process
rm -rf data/corpus/namuwiki/samples/
mkdir -p data/corpus/namuwiki/samples/mecab/

# run
python train_tokenizer.py \
    --cfg=./cfgs/pipelines/word_piece_with_morpheme.yaml

python train_tokenizer.py \
    --cfg=./cfgs/pipelines/word_piece.yaml

python train_tokenizer.py \
    --cfg=./cfgs/pipelines/cbpe.yaml

python train_tokenizer.py \
    --cfg=./cfgs/pipelines/bbpe.yaml
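For orientation, a minimal sketch of what a driver like train_tokenizer.py could look like: it loads the pipeline YAML passed via --cfg and dispatches to one of the BPE tokenizer classes added in this commit. The actual script and its config schema are not part of this excerpt, so the YAML keys, module paths, and registry entries below are assumptions for illustration only.

# Hypothetical driver sketch; not the repository's train_tokenizer.py.
import argparse
import yaml

# Assumed module names; the diff only shows relative imports.
from char_level_bpe import CharBPETokenizer
from byte_level_bpe import ByteLevelBPETokenizer

# The word_piece pipelines would map to classes not shown in this excerpt.
TOKENIZERS = {
    "cbpe": CharBPETokenizer,
    "bbpe": ByteLevelBPETokenizer,
}

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", required=True, help="path to a pipeline YAML")
    args = parser.parse_args()

    # Assumed config keys: "tokenizer", "corpus_files", "vocab_size".
    with open(args.cfg) as f:
        cfg = yaml.safe_load(f)

    tokenizer = TOKENIZERS[cfg["tokenizer"]]()
    tokenizer.train(
        files=cfg["corpus_files"],
        vocab_size=cfg.get("vocab_size", 30000),
    )

if __name__ == "__main__":
    main()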
@@ -0,0 +1,123 @@
from tokenizers import (
    Tokenizer,
    AddedToken,
    pre_tokenizers,
    decoders,
    trainers,
    processors,
)
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union, Dict, Tuple, Iterator


class ByteLevelBPETokenizer(BaseTokenizer):
    """ByteLevelBPETokenizer
    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                )
            )
        else:
            tokenizer = Tokenizer(BPE())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return ByteLevelBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """ Train the model using the given files """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """ Train the model using the given iterator """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
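A minimal usage sketch for the class above, assuming the local BaseTokenizer exposes the wrapped tokenizers.Tokenizer as `_tokenizer` (as the train methods do). The file glob and special tokens are placeholders, not values taken from the repository configs.

from glob import glob

# Train a byte-level BPE on the pre-processed namuwiki samples (assumed layout).
bbpe = ByteLevelBPETokenizer(add_prefix_space=True)
bbpe.train(
    files=glob("data/corpus/namuwiki/samples/*.txt"),  # placeholder path
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<pad>", "<unk>", "<s>", "</s>"],  # placeholder specials
)

# The wrapped `tokenizers.Tokenizer` is stored on `_tokenizer`, so it can be
# used for encoding directly.
print(bbpe._tokenizer.encode("나무위키 코퍼스").tokens)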
@@ -0,0 +1,150 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import (
    Sequence,
    Lowercase,
    unicode_normalizer_from_str,
    BertNormalizer,
)
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union, Dict, Tuple, Iterator


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer
    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the
    original Sennrich subword-nmt implementation by the following options, which you can
    deactivate:
      - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
          * removing any control characters and replacing all whitespaces with the classic one
          * handling Chinese chars by putting spaces around them
          * stripping all accents
      - splitting on punctuation in addition to whitespaces (deactivate it with
        `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
                )
            )
        else:
            tokenizer = Tokenizer(BPE())

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """ Train the model using the given files """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """ Train the model using the given iterator """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
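The character-level variant is driven the same way. A small sketch, again with placeholder paths and under the assumption that `_tokenizer` gives access to the wrapped `tokenizers.Tokenizer`:

from glob import glob

# Train the original (character-level) BPE with an explicit end-of-word suffix.
cbpe = CharBPETokenizer(suffix="</w>", bert_normalizer=True)
cbpe.train(
    files=glob("data/corpus/namuwiki/samples/*.txt"),  # placeholder path
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<unk>"],
    suffix="</w>",
)

encoded = cbpe._tokenizer.encode("나무위키")
print(encoded.tokens)  # subword pieces; word-final pieces end in "</w>"
print(cbpe._tokenizer.decode(encoded.ids))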