
Commit af7910a

Merge pull request #753 from PyThaiNLP/improve-1
Add "Reduce import time" (#719) to PyThaiNLP 3.1.1
2 parents: ddd785d + f3bc46e
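
Every file below applies the same deferred-import pattern: imports of heavier or optional dependencies (requests, gensim, bpemb, pycrfsuite, sentencepiece, onnxruntime, plus a few standard-library modules such as hashlib, tarfile, zipfile, shutil, and random) move from module level into the function or method that actually uses them, so `import pythainlp` itself stays cheap. A minimal, self-contained sketch of the idea, using standard-library stand-ins rather than anything from this diff:

```python
import sys


def fetch_catalog(url: str):
    """Return a parsed JSON catalog fetched from url."""
    # Deferred imports: the loading cost is paid on the first call,
    # not when this module is imported.
    import json
    from urllib.request import urlopen

    with urlopen(url) as response:
        return json.load(response)


# In a fresh interpreter, the deferred modules are not loaded until
# fetch_catalog() is actually called:
print("urllib.request loaded?", "urllib.request" in sys.modules)
```

The trade-off is that the first call to such a function is a little slower and a missing dependency only surfaces at call time, which is why the pattern is usually reserved for imports that are costly to load or not needed on every code path.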

15 files changed: 25 additions, 33 deletions
README.md

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil
 PyThaiNLP เป็นไลบารีภาษาไพทอนสำหรับประมวลผลภาษาธรรมชาติ คล้ายกับ NLTK โดยเน้นภาษาไทย [ดูรายละเอียดภาษาไทยได้ที่ README_TH.MD](https://github.com/PyThaiNLP/pythainlp/blob/dev/README_TH.md)
 
 **News**
+> PyThaiNLP join Hacktoberfest 2022!! https://github.com/PyThaiNLP/pythainlp/issues/717
 
 > Now, You can contact or ask any questions with the PyThaiNLP team. <a href="https://matrix.to/#/#thainlp:matrix.org" rel="noopener" target="_blank"><img src="https://matrix.to/img/matrix-badge.svg" alt="Chat on Matrix"></a>
 

docs/api/tools.rst

Lines changed: 1 addition & 1 deletion
@@ -10,4 +10,4 @@ Modules
 .. autofunction:: get_full_data_path
 .. autofunction:: get_pythainlp_data_path
 .. autofunction:: get_pythainlp_path
-.. autofunction:: misspell
+.. autofunction:: pythainlp.tools.misspell.misspell

pythainlp/augment/word2vec/bpemb_wv.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from pythainlp.augment.word2vec.core import Word2VecAug
-from bpemb import BPEmb
 from typing import List, Tuple
 
 
@@ -12,6 +11,7 @@ class BPEmbAug:
     `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
     """
     def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
+        from bpemb import BPEmb
         self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
         self.model = self.bpemb_temp.emb
         self.load_w2v()

pythainlp/augment/word2vec/core.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from typing import List, Tuple
-import gensim.models.keyedvectors as word2vec
 import itertools
 
 
@@ -13,6 +12,7 @@ def __init__(
         :param object tokenize: tokenize function
         :param str type: moodel type (file, binary)
         """
+        import gensim.models.keyedvectors as word2vec
        self.tokenizer = tokenize
        if type == "file":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model)

pythainlp/corpus/core.py

Lines changed: 9 additions & 10 deletions
@@ -2,36 +2,29 @@
 """
 Corpus related functions.
 """
-
-import hashlib
 import os
 from typing import Union
-from urllib.request import urlopen
 import json
 
-import requests
 from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
 from pythainlp.tools import get_full_data_path
-from requests.exceptions import HTTPError
-import tarfile
-import zipfile
-import shutil
 from pythainlp import __version__
 
 
 _CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE")
 
 
-def get_corpus_db(url: str) -> requests.Response:
+def get_corpus_db(url: str):
     """
     Get corpus catalog from server.
 
     :param str url: URL corpus catalog
     """
+    import requests
     corpus_db = None
     try:
         corpus_db = requests.get(url)
-    except HTTPError as http_err:
+    except requests.exceptions.HTTPError as http_err:
         print(f"HTTP error occurred: {http_err}")
     except Exception as err:
         print(f"Non-HTTP error occurred: {err}")
@@ -231,6 +224,8 @@ def _download(url: str, dst: str) -> int:
     """
     _CHUNK_SIZE = 64 * 1024  # 64 KiB
 
+    import requests
+    from urllib.request import urlopen
     file_size = int(urlopen(url).info().get("Content-Length", -1))
     r = requests.get(url, stream=True)
     with open(get_full_data_path(dst), "wb") as f:
@@ -262,6 +257,7 @@ def _check_hash(dst: str, md5: str) -> None:
     @param: md5 place to hash the file (MD5)
     """
     if md5 and md5 != "-":
+        import hashlib
         with open(get_full_data_path(dst), "rb") as f:
             content = f.read()
             file_md5 = hashlib.md5(content).hexdigest()
@@ -423,13 +419,15 @@ def download(
         foldername = None
 
         if corpus_versions["is_tar_gz"] == "True":
+            import tarfile
             is_folder = True
             foldername = name+"_"+str(version)
             if not os.path.exists(get_full_data_path(foldername)):
                 os.mkdir(get_full_data_path(foldername))
             with tarfile.open(get_full_data_path(file_name)) as tar:
                 tar.extractall(path=get_full_data_path(foldername))
         elif corpus_versions["is_zip"] == "True":
+            import zipfile
             is_folder = True
             foldername = name+"_"+str(version)
             if not os.path.exists(get_full_data_path(foldername)):
@@ -520,6 +518,7 @@ def remove(name: str) -> bool:
     if data:
         path = get_corpus_path(name)
         if data[0].get("is_folder"):
+            import shutil
             os.remove(get_full_data_path(data[0].get("filename")))
             shutil.rmtree(path, ignore_errors=True)
         else:
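
The corpus module carries the largest set of these changes: requests, urllib.request, hashlib, tarfile, zipfile, and shutil are now loaded only inside the functions that need them, and get_corpus_db drops its requests.Response return annotation so the signature no longer requires requests at module level. A rough way to see the effect, assuming pythainlp is installed, is to time a cold import in a fresh interpreter; CPython's `python -X importtime -c "import pythainlp"` gives a more detailed per-module breakdown:

```python
# Rough cold-import timing; run in a fresh interpreter so nothing is
# already cached in sys.modules. Assumes the pythainlp package is installed.
import time

start = time.perf_counter()
import pythainlp  # noqa: E402  (placed here deliberately so it can be timed)

print(f"import pythainlp took {time.perf_counter() - start:.3f} s")
```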

pythainlp/tag/_tag_perceptron.py

Lines changed: 1 addition & 4 deletions
@@ -14,11 +14,7 @@
 
 This tagger is provided under the terms of the MIT License.
 """
-
-from __future__ import absolute_import
-
 import json
-import random
 from collections import defaultdict
 from typing import Dict, Iterable, List, Tuple, Union
 
@@ -160,6 +156,7 @@ def train(
             location.
         :param nr_iter: Number of training iterations.
         """
+        import random
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for _ in range(nr_iter):

pythainlp/tag/thainer.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,6 @@
 
 from typing import Dict, List, Tuple, Union
 
-from pycrfsuite import Tagger as CRFTagger
 from pythainlp.corpus import get_corpus_path, thai_stopwords
 from pythainlp.tag import pos_tag
 from pythainlp.tokenize import word_tokenize
@@ -98,6 +97,7 @@ def __init__(self, version: str = "1.5") -> None:
             It's support Thai NER 1.4 & 1.5.
             The defualt value is `1.5`
         """
+        from pycrfsuite import Tagger as CRFTagger
         self.crf = CRFTagger()
 
         if version == "1.4":

pythainlp/tag/wangchanberta_onnx.py

Lines changed: 5 additions & 4 deletions
@@ -1,16 +1,17 @@
 # -*- coding: utf-8 -*-
 from typing import List
 import json
-import sentencepiece as spm
+
 import numpy as np
-from onnxruntime import (
-    InferenceSession, SessionOptions, GraphOptimizationLevel
-)
 from pythainlp.corpus import get_path_folder_corpus
 
 
 class WngchanBerta_ONNX:
     def __init__(self, model_name: str, model_version: str, file_onnx: str, providers: List[str] = ['CPUExecutionProvider']) -> None:
+        import sentencepiece as spm
+        from onnxruntime import (
+            InferenceSession, SessionOptions, GraphOptimizationLevel
+        )
         self.model_name = model_name
         self.model_version = model_version
         self.options = SessionOptions()

pythainlp/tokenize/core.py

Lines changed: 1 addition & 2 deletions
@@ -4,7 +4,6 @@
 """
 import re
 from typing import Iterable, List, Union
-import warnings
 
 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -14,7 +13,6 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
-from pythainlp import thai_characters
 from pythainlp.util.trie import Trie, dict_trie
 
 
@@ -63,6 +61,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
     _list_all = []
     if isinstance(segments[0], str):
         segments = [segments]
+    from pythainlp import thai_characters
     for i, s in enumerate(segments):
         _list_sents = []
         _add_index = []

pythainlp/tools/__init__.py

Lines changed: 0 additions & 3 deletions
@@ -4,7 +4,6 @@
     "get_full_data_path",
     "get_pythainlp_data_path",
     "get_pythainlp_path",
-    "misspell",
 ]
 
 from pythainlp.tools.path import (
@@ -13,5 +12,3 @@
     get_pythainlp_data_path,
     get_pythainlp_path,
 )
-
-from pythainlp.tools.misspell import misspell
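
With the re-export gone, the misspell function is no longer exposed through pythainlp.tools itself; callers import it from its own module, which is what the docs/api/tools.rst change above now points to:

```python
# misspell is imported from its own module path
# (pythainlp.tools no longer re-exports the function).
from pythainlp.tools.misspell import misspell
```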
