-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add test code for config and sampling
- Loading branch information
Showing
11 changed files
with
131 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
# vscode | ||
.vscode | ||
|
||
# data (anchored to the repository root; a leading "./" is not understood by
# gitignore and such a pattern never matches anything)
/data
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
tokenizers | ||
konlpy | ||
PyYAML | ||
PyYAML | ||
pytest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import os | ||
from pathlib import Path | ||
import pytest | ||
from tools.config import cfg_from_yaml_file | ||
|
||
|
||
@pytest.fixture(scope="session", autouse=True)
def setup_and_teardown_package():
    """Create the sample namuwiki corpus files for the session, then remove them.

    Before the first test runs, two text files with identical content
    (``namuwiki.01.txt`` and ``namuwiki.02.txt``) are written under
    ``tests/resources/`` relative to the current working directory. Once the
    session is over, both files are deleted again.
    """
    test_dir = Path.cwd() / "tests/resources/"
    # The directory may be missing (e.g. on a fresh checkout); create it
    # instead of failing inside open() below.
    test_dir.mkdir(parents=True, exist_ok=True)

    list_of_namuwiki_filepath = [test_dir / "namuwiki.01.txt", test_dir / "namuwiki.02.txt"]
    list_of_tests_samples = [
        "[목차]== 개요 ==BEMANI 시리즈의 악곡. \n",
        "보컬은 코사카 리유, Noria. \n",
        "롱버젼이 앨범 \"BeForU\"에 수록되었다. \n",
        "== 팝픈뮤직 == * 곡 목록으로 돌아가기팝픈뮤직 9에 처음 수록되었다. \n",
        "EX채보는 전체적으로 8비트 세로연타+동시치기 위주. \n",
        "후반부에 밀도가 조금 높아지므로 주의. \n",
        "17 무비 때 삭제되었다가 19 튠스트릿에서 다른 BeForU 멤버의 곡들과 함께 부활했다. \n",
        "=== 아티스트 코멘트 ===||제 안에서 BRE∀K DOWN!에 이은 걸즈 락 노선제 2탄이 ☆shining☆인 겁니다. \n",
        "이번에는 BeForU로부터 코사카 리유와 시리아시 노리아 두 사람을 유닛화해서,로서 전면적으로 기용했습니다. \n",
        "shining이라는 단어는, 제가 고등학생 시절부터제 자신을 던졌던 단어로, 살아가면서잊을 리가 없는, 외상()의 마음입니다.",
    ]

    for namuwiki_filepath in list_of_namuwiki_filepath:
        # Explicit UTF-8 so the Korean samples are written the same way on every platform.
        with open(namuwiki_filepath, mode="w", encoding="utf-8") as sample_file:
            sample_file.writelines(list_of_tests_samples)

    yield

    for namuwiki_filepath in list_of_namuwiki_filepath:
        try:
            os.remove(namuwiki_filepath)
        except FileNotFoundError:
            # Already removed during the session; nothing left to clean up.
            pass
|
||
|
||
@pytest.fixture(scope="package")
def cfg():
    """Return the word-piece/morpheme pipeline config adjusted for the tests.

    The config is loaded from ``cfgs/pipelines/word_piece_with_morpheme.yaml``
    (relative to the current working directory). Its data path and save path
    are then redirected to ``tests/resources`` and the sample rate is set to
    0.01.
    """
    working_dir = Path.cwd()
    config = cfg_from_yaml_file(working_dir / "cfgs/pipelines/word_piece_with_morpheme.yaml")

    path_section = config['Path']
    path_section['data-path'] = working_dir / "tests/resources/namuwiki.*.txt"
    path_section['save-path'] = working_dir / "tests/resources/samples/"

    config['Samples']['rate'] = 0.01
    return config
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from tools.config import cfg_from_yaml_file | ||
|
||
|
||
def test_configs(cfg):
    """Check that the loaded config exposes the expected sections and keys.

    The comparisons are order-sensitive: the keys must appear in exactly the
    listed order.

    Args:
        cfg: Config mapping provided by the ``cfg`` fixture.
    """
    assert list(cfg) == ['Samples', 'Morpheme-aware', 'Path', 'Pipelines']
    assert list(cfg['Samples']) == ['rate']
    # Identity check: the flag has to be the boolean True, not merely a value
    # that compares equal to it (such as 1).
    assert cfg['Morpheme-aware'] is True
    assert list(cfg['Path']) == ['data-path', 'save-path']
    assert list(cfg['Pipelines']) == ['Tokenizer', 'normalizer', 'pre_tokenizer', 'decoder']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import glob | ||
import os | ||
from train_tokenizer import sampling, morphme | ||
|
||
def test_samplings(cfg):
    """Smoke-test corpus sampling and, when enabled, the morpheme step.

    The test passes when ``sampling`` (and ``morphme`` if
    ``cfg['Morpheme-aware']`` is truthy) run without raising. Generated
    ``.txt`` files below the configured save path are removed afterwards.

    Args:
        cfg: Config mapping provided by the ``cfg`` fixture.
    """
    def tear_down(cfg):
        # Delete every .txt file found recursively below the save path.
        pattern = str(cfg['Path']['save-path']) + '/**/*.txt'
        for txt_path in glob.glob(pattern, recursive=True):
            os.remove(txt_path)

    try:
        # NOTE(review): save_path is the literal '/samples/' rather than
        # cfg['Path']['save-path']; how sampling()/morphme() resolve it is not
        # visible here -- confirm this is intended.
        sampling(data_path=cfg['Path']['data-path'], sample_rate=cfg['Samples']['rate'], save_path='/samples/')

        if cfg['Morpheme-aware']:
            morphme(data_path=cfg['Path']['save-path'], save_path='/mecab/')
    finally:
        # Clean up even when one of the steps above raises, so a failing run
        # does not leave generated files behind for the next one.
        tear_down(cfg)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import yaml | ||
|
||
# tokenizer-pre_tokenizers | ||
from tokenizers.pre_tokenizers import ( | ||
PreTokenizer, | ||
ByteLevel, | ||
Whitespace, | ||
WhitespaceSplit, | ||
BertPreTokenizer, | ||
Metaspace, | ||
CharDelimiterSplit, | ||
Punctuation, | ||
Sequence, | ||
Digits, | ||
UnicodeScripts, | ||
Split, | ||
) | ||
|
||
# tokenizer-normalizers | ||
from tokenizers.normalizers import NFKC, Lowercase | ||
|
||
# tokenizers | ||
from tools.word_piece import WordPieceTokenizer | ||
from tools.BBPE import ByteLevelBPETokenizer | ||
from tools.CBPE import CharBPETokenizer | ||
|
||
|
||
def cfg_from_yaml_file(cfg_file):
    """Load a pipeline config from a YAML file and evaluate its pipeline entries.

    After parsing, the ``Pipelines`` values ``Tokenizer``, ``pre_tokenizer``
    and ``decoder`` as well as every item of ``normalizer`` are passed through
    ``eval`` and replaced by the resulting objects. The final config is also
    printed to stdout.

    Args:
        cfg_file: Path of the YAML config file to read.

    Returns:
        The parsed config mapping with the evaluated ``Pipelines`` entries.

    Raises:
        yaml.YAMLError: If the file is not valid YAML. (Previously a bare
            ``except`` hid this behind a second, Loader-less ``yaml.load``
            call on the already-consumed stream.)
    """
    def check_and_evalfunc(config):
        # SECURITY: eval() executes arbitrary expressions taken from the config
        # file. Only load config files from trusted sources.
        pipelines = config['Pipelines']
        pipelines['Tokenizer'] = eval(pipelines['Tokenizer'])
        pipelines['normalizer'] = [eval(i) for i in pipelines['normalizer']]
        pipelines['pre_tokenizer'] = eval(pipelines['pre_tokenizer'])
        pipelines['decoder'] = eval(pipelines['decoder'])
        return config

    # PyYAML releases without FullLoader fall back to SafeLoader; choosing the
    # loader up front means genuine parse errors are no longer swallowed.
    loader = getattr(yaml, 'FullLoader', yaml.SafeLoader)

    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(cfg_file, 'r', encoding='utf-8') as f:
        config = yaml.load(f, Loader=loader)

    config = check_and_evalfunc(config)
    print(config)

    return config
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters