feat: Add test code for config and sampling
soeque1 committed Jan 31, 2021
1 parent cf54c57 commit 2acc995
Showing 11 changed files with 131 additions and 66 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# vscode
.vscode

# data
./data

3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
tokenizers
konlpy
PyYAML
PyYAML
pytest
40 changes: 40 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,40 @@
import os
from pathlib import Path
import pytest
from tools.config import cfg_from_yaml_file


@pytest.fixture(scope="session", autouse=True)
def setup_and_teardown_package():
    test_dir = Path.cwd() / "tests/resources/"
    list_of_namuwiki_filepath = [test_dir / "namuwiki.01.txt", test_dir / "namuwiki.02.txt"]
    list_of_tests_samples = ["[목차]== 개요 ==BEMANI 시리즈의 악곡. \n",
                             "보컬은 코사카 리유, Noria. \n",
                             "롱버젼이 앨범 \"BeForU\"에 수록되었다. \n",
                             "== 팝픈뮤직 == * 곡 목록으로 돌아가기팝픈뮤직 9에 처음 수록되었다. \n",
                             "EX채보는 전체적으로 8비트 세로연타+동시치기 위주. \n",
                             "후반부에 밀도가 조금 높아지므로 주의. \n",
                             "17 무비 때 삭제되었다가 19 튠스트릿에서 다른 BeForU 멤버의 곡들과 함께 부활했다. \n",
                             "=== 아티스트 코멘트 ===||제 안에서 BRE∀K DOWN!에 이은 걸즈 락 노선제 2탄이 ☆shining☆인 겁니다. \n",
                             "이번에는 BeForU로부터 코사카 리유와 시리아시 노리아 두 사람을 유닛화해서,로서 전면적으로 기용했습니다. \n",
                             "shining이라는 단어는, 제가 고등학생 시절부터제 자신을 던졌던 단어로, 살아가면서잊을 리가 없는, 외상()의 마음입니다."]

    for namuwiki_filepath in list_of_namuwiki_filepath:
        with open(namuwiki_filepath, mode="w", encoding="utf-8") as io:
            for sample in list_of_tests_samples:
                io.write(sample)

    yield

    for namuwiki_filepath in list_of_namuwiki_filepath:
        os.remove(namuwiki_filepath)


@pytest.fixture(scope="package")
def cfg():
    yaml_path = Path.cwd() / "cfgs/pipelines/word_piece_with_morpheme.yaml"
    cfg = cfg_from_yaml_file(yaml_path)
    cfg['Path']['data-path'] = Path.cwd() / "tests/resources/namuwiki.*.txt"
    cfg['Path']['save-path'] = Path.cwd() / "tests/resources/samples/"
    cfg['Samples']['rate'] = 0.01
    return cfg
Empty file added tests/resources/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions tests/test_config.py
@@ -0,0 +1,9 @@
from tools.config import cfg_from_yaml_file


def test_configs(cfg):
    assert list(cfg.keys()) == ['Samples', 'Morpheme-aware', 'Path', 'Pipelines']
    assert list(cfg.get('Samples').keys()) == ['rate']
    assert cfg.get('Morpheme-aware') == True
    assert list(cfg.get('Path').keys()) == ['data-path', 'save-path']
    assert list(cfg.get('Pipelines').keys()) == ['Tokenizer', 'normalizer', 'pre_tokenizer', 'decoder']
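
For reference, the YAML shape these assertions imply looks roughly like the sketch below. The values are illustrative assumptions only; the actual cfgs/pipelines/word_piece_with_morpheme.yaml in the repository may differ.

# Hypothetical config body matching the key structure asserted above.
# Values are placeholders, not the repository's real settings.
import yaml

example = yaml.safe_load("""
Samples:
  rate: 0.01
Morpheme-aware: true
Path:
  data-path: ./data/namuwiki.*.txt
  save-path: ./data/samples/
Pipelines:
  Tokenizer: WordPieceTokenizer
  normalizer: [NFKC(), Lowercase()]
  pre_tokenizer: BertPreTokenizer()
  decoder: Metaspace()
""")

assert list(example.keys()) == ['Samples', 'Morpheme-aware', 'Path', 'Pipelines']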
15 changes: 15 additions & 0 deletions tests/test_sampling.py
@@ -0,0 +1,15 @@
import glob
import os
from train_tokenizer import sampling, morphme

def test_samplings(cfg):
    def tear_down(cfg):
        for i in glob.glob(str(cfg['Path']['save-path']) + '/**/*.txt', recursive=True):
            os.remove(i)

    sampling(data_path=cfg['Path']['data-path'], sample_rate=cfg['Samples']['rate'], save_path='/samples/')

    if cfg['Morpheme-aware']:
        morphme(data_path=cfg['Path']['save-path'], save_path='/mecab/')

    tear_down(cfg)
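
With pytest now listed in requirements.txt, the suite can be run from the repository root. A minimal runner sketch, assuming the cfgs/pipelines/word_piece_with_morpheme.yaml referenced by conftest.py exists:

# Minimal test-runner sketch; equivalent to running `pytest -q tests` from
# the repository root. conftest.py writes tests/resources/namuwiki.*.txt
# before the session starts and deletes them again afterwards.
import sys
import pytest

if __name__ == "__main__":
    sys.exit(pytest.main(["-q", "tests"]))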
45 changes: 45 additions & 0 deletions tools/config.py
@@ -0,0 +1,45 @@
import yaml

# tokenizer-pre_tokenizers
from tokenizers.pre_tokenizers import (
PreTokenizer,
ByteLevel,
Whitespace,
WhitespaceSplit,
BertPreTokenizer,
Metaspace,
CharDelimiterSplit,
Punctuation,
Sequence,
Digits,
UnicodeScripts,
Split,
)

# tokenizer-normalizers
from tokenizers.normalizers import NFKC, Lowercase

# tokenizers
from tools.word_piece import WordPieceTokenizer
from tools.BBPE import ByteLevelBPETokenizer
from tools.CBPE import CharBPETokenizer


def cfg_from_yaml_file(cfg_file):
    def check_and_evalfunc(config):
        config['Pipelines']['Tokenizer'] = eval(config['Pipelines']['Tokenizer'])
        config['Pipelines']['normalizer'] = [eval(i) for i in config['Pipelines']['normalizer']]
        config['Pipelines']['pre_tokenizer'] = eval(config['Pipelines']['pre_tokenizer'])
        config['Pipelines']['decoder'] = eval(config['Pipelines']['decoder'])
        return config

    with open(cfg_file, 'r') as f:
        try:
            config = yaml.load(f, Loader=yaml.FullLoader)
        except:
            config = yaml.load(f)

    config = check_and_evalfunc(config)
    print(config)

    return config
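
A rough usage sketch of the extracted loader follows. The concrete objects behind each Pipelines entry depend on the strings in the YAML file, so the resolved values noted in the comments are assumptions based on the classes imported above.

# Hypothetical caller of cfg_from_yaml_file; the path matches the fixture in
# tests/conftest.py, and the resolved values noted below are examples only.
from tools.config import cfg_from_yaml_file

cfg = cfg_from_yaml_file("cfgs/pipelines/word_piece_with_morpheme.yaml")

# check_and_evalfunc has already eval'd the Pipelines entries, so they are
# Python objects rather than strings, e.g.:
#   cfg['Pipelines']['Tokenizer']     -> a tokenizer wrapper such as WordPieceTokenizer
#   cfg['Pipelines']['normalizer']    -> e.g. [NFKC(), Lowercase()]
#   cfg['Pipelines']['pre_tokenizer'] -> e.g. BertPreTokenizer()
# Plain sections such as cfg['Samples']['rate'], cfg['Path'] and
# cfg['Morpheme-aware'] are left untouched by check_and_evalfunc.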
15 changes: 0 additions & 15 deletions tools/tokenizer.py
@@ -3,21 +3,6 @@
# CharBPETokenizer,
# )

from tokenizers.pre_tokenizers import (
PreTokenizer,
ByteLevel,
Whitespace,
WhitespaceSplit,
BertPreTokenizer,
Metaspace,
CharDelimiterSplit,
Punctuation,
Sequence,
Digits,
UnicodeScripts,
Split,
)

from tools.word_piece import WordPieceTokenizer
from tools.BBPE import ByteLevelBPETokenizer
from tools.CBPE import CharBPETokenizer
67 changes: 17 additions & 50 deletions train_tokenizer.py
@@ -2,32 +2,10 @@
import os
import re
import glob
import yaml

# tokenizer-pre_tokenizers
from tokenizers.pre_tokenizers import (
PreTokenizer,
ByteLevel,
Whitespace,
WhitespaceSplit,
BertPreTokenizer,
Metaspace,
CharDelimiterSplit,
Punctuation,
Sequence,
Digits,
UnicodeScripts,
Split,
)

# tokenizer-normalizers
# configs
from tools.config import cfg_from_yaml_file
from tokenizers import normalizers
from tokenizers.normalizers import NFKC, Lowercase

# tokenizers
from tools.word_piece import WordPieceTokenizer
from tools.BBPE import ByteLevelBPETokenizer
from tools.CBPE import CharBPETokenizer

# utils
from tools.utils import (
@@ -36,44 +14,33 @@
)


def sampling(data_path: str, sample_rate: float, save_path: str = '/samples/') -> None:
    files = glob.glob(str(data_path))
    params = {'inputs': files, 'targets': ["/".join([os.path.dirname(i), save_path, os.path.basename(i)]) for i in files]}
    params.update({'sample_rate': sample_rate})
    multiprocessing_with_async(params, func=preprocess_shuf_pool)

def cfg_from_yaml_file(cfg_file):
    def check_and_evalfunc(config):
        config['Pipelines']['Tokenizer'] = eval(config['Pipelines']['Tokenizer'])
        config['Pipelines']['normalizer'] = [eval(i) for i in config['Pipelines']['normalizer']]
        config['Pipelines']['pre_tokenizer'] = eval(config['Pipelines']['pre_tokenizer'])
        config['Pipelines']['decoder'] = eval(config['Pipelines']['decoder'])
        return config

    with open(cfg_file, 'r') as f:
        try:
            config = yaml.load(f, Loader=yaml.FullLoader)
        except:
            config = yaml.load(f)

    config = check_and_evalfunc(config)

    return config
def morphme(data_path: str, save_path: str = '/mecab/') -> None:
    files = glob.glob(str(data_path) + '/*.txt')
    params = {'inputs': files, 'targets': ["/".join([os.path.dirname(i), save_path, os.path.basename(i)]) for i in files]}
    multiprocessing_with_async(params, func=preprocess_mecab_pool)
    return str(data_path) + str(save_path)


def main(cfg):
    config = cfg_from_yaml_file(cfg)
    print(config)

    # Sampling
    files = glob.glob(config['Path']['data-path'])
    params = {'inputs': files, 'targets': ["/".join([os.path.dirname(i), '/samples/', os.path.basename(i)]) for i in files]}
    params.update({'sample_rate': config['Samples']['rate']})
    multiprocessing_with_async(params, func=preprocess_shuf_pool)
    sampling(data_path=config['Path']['data-path'], sample_rate=config['Samples']['rate'], save_path='/samples/')

    # Morphme
    if config['Morpheme-aware']:
        files = glob.glob(config['Path']['save-path'] + '/*.txt')
        params = {'inputs': files, 'targets': ["/".join([os.path.dirname(i), '/mecab/', os.path.basename(i)]) for i in files]}
        multiprocessing_with_async(params, func=preprocess_mecab_pool)
        texts = glob.glob(config['Path']['save-path'] + '/mecab/*.txt')
        save_path = morphme(data_path=config['Path']['save-path'], save_path='/mecab/')
    else:
        texts = glob.glob(config['Path']['save-path'] + '*.txt')
        save_path = config['Path']['save-path']

    texts = glob.glob(save_path + '*.txt')

    # tokenizer
    tokenizer = config['Pipelines']['Tokenizer']
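
After this refactor, sampling and morphme are plain importable functions, which is what tests/test_sampling.py relies on. A sketch of calling them directly, with placeholder paths rather than the project's real data layout:

# Hypothetical direct use of the refactored helpers; data/namuwiki.*.txt and
# data/samples/ are placeholder paths for illustration only.
from train_tokenizer import sampling, morphme

# Write sampled copies of each matching file into a /samples/ subdirectory.
sampling(data_path="data/namuwiki.*.txt", sample_rate=0.01, save_path='/samples/')

# Run the Mecab preprocessing over the sampled files; morphme returns the
# directory that main() later globs for *.txt.
mecab_dir = morphme(data_path="data/samples/", save_path='/mecab/')
print(mecab_dir)  # "data/samples//mecab/"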
