From 15ec737001abd3632de4ed691b92f3cd0128e8c8 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Thu, 12 Dec 2024 22:50:14 +0700 Subject: [PATCH 1/5] PyThaiNLP v5.0.5 - Add clause_tokenize warnings #1026 --- CITATION.cff | 2 +- README.md | 2 +- README_TH.md | 2 +- pythainlp/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index a5e966080..78c16ab57 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,6 @@ authors: given-names: "Pattarawat" orcid: "https://orcid.org/0000-0000-0000-0000" title: "PyThaiNLP: Thai Natural Language Processing in Python" -version: v5.0.4 +version: v5.0.5 license: Apache-2.0 date-released: 2024-06-02 \ No newline at end of file diff --git a/README.md b/README.md index 5f20220bc..150332451 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร | Version | Description | Status | |:------:|:--:|:------:| -| [5.0.4](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) | +| [5.0.5](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) | | [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.1 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/900) | ## Getting Started diff --git a/README_TH.md b/README_TH.md index c7b6d0a25..e09a6fe90 100644 --- a/README_TH.md +++ b/README_TH.md @@ -20,7 +20,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร | รุ่น | คำอธิบาย | สถานะ | |:------:|:--:|:------:| -| [5.0.4](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) | +| [5.0.5](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) | | [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.1 | 
[Change Log](https://github.com/PyThaiNLP/pythainlp/issues/900) | ติดตามพวกเราบน [PyThaiNLP Facebook page](https://www.facebook.com/pythainlp/) เพื่อรับข่าวสารเพิ่มเติม diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 486399802..a79455018 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 -__version__ = "5.0.4" +__version__ = "5.0.5" thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars diff --git a/setup.cfg b/setup.cfg index d4b3f16d7..40262cc74 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 5.0.4 +current_version = 5.0.5 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 949f1c20c..6f9bd926d 100644 --- a/setup.py +++ b/setup.py @@ -161,7 +161,7 @@ setup( name="pythainlp", - version="5.0.4", + version="5.0.5", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", From b3e2d6e0f917f637d062a98b5e407b93977d5265 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 13 Dec 2024 12:23:11 +0000 Subject: [PATCH 2/5] Fix maiyamok() --- pythainlp/util/__init__.py | 5 +- pythainlp/util/normalize.py | 112 +++++++++++++++++++++++++----------- 2 files changed, 81 insertions(+), 36 deletions(-) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 6807c9d53..e4761e900 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -19,11 +19,13 @@ "display_thai_char", "emoji_to_thai", "eng_to_thai", + "expand_maiyamok", "find_keyword", "ipa_to_rtgs", "is_native_thai", "isthai", "isthaichar", + "maiyamok", "nectec_to_ipa", "normalize", "now_reign_year", @@ -85,8 +87,9 @@ from pythainlp.util.emojiconv import emoji_to_thai from pythainlp.util.keywords import find_keyword, rank 
from pythainlp.util.normalize import ( - normalize, + expand_maiyamok, maiyamok, + normalize, remove_dangling, remove_dup_spaces, remove_repeat_vowels, diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index bee233afe..8f7ec3d0d 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -4,6 +4,7 @@ """ Text normalization """ + import re from typing import List, Union @@ -76,7 +77,7 @@ def remove_dangling(text: str) -> str: from pythainlp.util import remove_dangling - remove_dangling('๊ก') + remove_dangling("๊ก") # output: 'ก' """ return _RE_REMOVE_DANGLINGS.sub("", text) @@ -98,7 +99,7 @@ def remove_dup_spaces(text: str) -> str: from pythainlp.util import remove_dup_spaces - remove_dup_spaces('ก ข ค') + remove_dup_spaces("ก ข ค") # output: 'ก ข ค' """ while " " in text: @@ -132,7 +133,7 @@ def remove_tonemark(text: str) -> str: from pythainlp.util import remove_tonemark - remove_tonemark('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด') + remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด") # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด """ for ch in tonemarks: @@ -235,10 +236,10 @@ def normalize(text: str) -> str: from pythainlp.util import normalize - normalize('เเปลก') # starts with two Sara E + normalize("เเปลก") # starts with two Sara E # output: แปลก - normalize('นานาาา') + normalize("นานาาา") # output: นานา """ text = remove_zw(text) @@ -249,46 +250,87 @@ def normalize(text: str) -> str: return text -def maiyamok(sent: Union[str, List[str]]) -> List[str]: +def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]: """ - Thai MaiYaMok + Expand Maiyamok. - MaiYaMok (ๆ) is the mark of duplicate word in Thai language. - This function is preprocessing MaiYaMok in Thai sentence. + Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word + repetition. This function preprocesses Thai text by replacing + Maiyamok with a word being repeated. 
- :param Union[str, List[str]] sent: input sentence (list or str) + :param Union[str, List[str]] sent: sentence (list or string) :return: list of words :rtype: List[str] :Example: :: + from pythainlp.util import expand_maiyamok - from pythainlp.util import maiyamok - - maiyamok("เด็กๆชอบไปโรงเรียน") - # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน'] - - maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"]) - # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้'] + expand_maiyamok("คนๆนก") + # output: ['คน', 'คน', 'นก'] """ if isinstance(sent, str): sent = word_tokenize(sent) - _list_word = [] - i = 0 - for j, text in enumerate(sent): - if text.isspace() and "ๆ" in sent[j + 1]: - continue - if " ๆ" in text: - text = text.replace(" ๆ", "ๆ") - if "ๆ" == text: - text = _list_word[i - 1] - elif "ๆ" in text: - count = text.count("ๆ") - text = _list_word[i - 1] - for _ in range(count): - _list_word.append(text) - i += 1 + + # Breaks Maiyamok that attached to others, e.g. 
"นกๆๆ", "นกๆ ๆ", "นกๆคน" + temp_toks: list[str] = [] + for _, token in enumerate(sent): + toks = re.split(r"(ๆ)", token) + toks = [tok for tok in toks if tok] # remove empty string ("") + temp_toks.extend(toks) + sent = temp_toks + + output_toks: list[str] = [] + + yamok = "ๆ" + yamok_count = 0 + len_sent = len(sent) + for i in range(len_sent - 1, -1, -1): # do it backward + if yamok_count == 0 or (i + 1 >= len_sent): + if sent[i] == yamok: + yamok_count = yamok_count + 1 + else: + output_toks.append(sent[i]) continue - _list_word.append(text) - i += 1 - return _list_word + + if sent[i] == yamok: + yamok_count = yamok_count + 1 + else: + if sent[i].isspace(): + if yamok_count > 0: # remove space before yamok + continue + else: # with preprocessing above, this should not happen + output_toks.append(sent[i]) + else: + output_toks.extend([sent[i]] * (yamok_count + 1)) + yamok_count = 0 + + return output_toks[::-1] + + +def maiyamok(sent: Union[str, List[str]]) -> List[str]: + """ + Expand Maiyamok. + + Deprecated. Use expand_maiyamok() instead. + + Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word + repetition. This function preprocesses Thai text by replacing + Maiyamok with a word being repeated. 
+ + :param Union[str, List[str]] sent: sentence (list or string) + :return: list of words + :rtype: List[str] + + :Example: + :: + + from pythainlp.util import expand_maiyamok + + expand_maiyamok("คนๆนก") + # output: ['คน', 'คน', 'นก'] + """ + warn_deprecation( + "pythainlp.util.maiyamok", "pythainlp.util.expand_maiyamok", "5.2" + ) + return expand_maiyamok(sent) From 6ccc11c19625456949574e1a4e191e28a610c633 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 13 Dec 2024 12:38:40 +0000 Subject: [PATCH 3/5] Add warn_deprecation --- pythainlp/tokenize/core.py | 17 +++++++------ pythainlp/tools/__init__.py | 4 ++++ pythainlp/tools/core.py | 48 +++++++++++++++++++++++++++++++++++++ pythainlp/util/normalize.py | 7 ++++-- 4 files changed, 65 insertions(+), 11 deletions(-) create mode 100644 pythainlp/tools/core.py diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index e14389b67..80c9342ca 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -4,8 +4,8 @@ """ Generic functions of tokenizers """ + import re -import warnings from typing import Iterable, List, Union from pythainlp.tokenize import ( @@ -21,6 +21,7 @@ rejoin_formatted_num, strip_whitespace, ) +from pythainlp.tools import warn_deprecation from pythainlp.util.trie import Trie, dict_trie @@ -45,13 +46,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]: # ['และ', 'คุณ', 'เล่น', 'มือถือ'], # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']] """ + warn_deprecation("pythainlp.tokenize.clause_tokenize", "", "5.0.5", "5.1") from pythainlp.tokenize.crfcls import segment - warnings.warn( - """ - clause_tokenize is no longer supported \ - and will be removed in version 5.1. 
- """, DeprecationWarning) return segment(doc) @@ -71,6 +68,7 @@ def word_detokenize( :: from pythainlp.tokenize import word_detokenize + print(word_detokenize(["เรา", "เล่น"])) # output: เราเล่น """ @@ -299,18 +297,19 @@ def word_tokenize( segments = segment(text) elif engine == "nlpo3": from pythainlp.tokenize.nlpo3 import segment + # Currently cannot handle custom_dict from inside word_tokenize(), # due to difference in type. - #if isinstance(custom_dict, str): + # if isinstance(custom_dict, str): # segments = segment(text, custom_dict=custom_dict) - #elif not isinstance(custom_dict, str) and not custom_dict: + # elif not isinstance(custom_dict, str) and not custom_dict: # raise ValueError( # f"""Tokenizer \"{engine}\": # custom_dict must be a str. # It is a dictionary name as assigned with load_dict(). # See pythainlp.tokenize.nlpo3.load_dict()""" # ) - #else: + # else: # segments = segment(text) segments = segment(text) else: diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index 0e07a37b5..db0f6e491 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -6,8 +6,12 @@ "get_full_data_path", "get_pythainlp_data_path", "get_pythainlp_path", + "safe_print", + "warn_deprecation", ] +from pythainlp.tools.core import safe_print, warn_deprecation + from pythainlp.tools.path import ( PYTHAINLP_DEFAULT_DATA_DIR, get_full_data_path, diff --git a/pythainlp/tools/core.py b/pythainlp/tools/core.py new file mode 100644 index 000000000..8e319657e --- /dev/null +++ b/pythainlp/tools/core.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +""" +Generic support functions for PyThaiNLP. +""" + +import sys +import warnings + + +def warn_deprecation( + deprecated_func: str, + replacing_func: str = "", + deprecated_version: str = "", + removal_version: str = "", +): + """Warn about the deprecation of a function. 
+ + :param str deprecated_func: Name of the deprecated function. + :param str replacing_func: Name of the function to use instead (optional). + :param str version: PyThaiNLP version in which the function will be deprecated (optional). + """ + message = f"The '{deprecated_func}' function is deprecated" + if deprecated_version: + message += f" since {deprecated_version}" + if not removal_version: + removal_version = "a future release" + message += f" and will be removed in {removal_version}." + if replacing_func: + message += f" Please use '{replacing_func}' instead." + warnings.warn(message, DeprecationWarning, stacklevel=2) + + +def safe_print(text: str): + """Print text to console, handling UnicodeEncodeError. + + :param text: Text to print. + :type text: str + """ + try: + print(text) + except UnicodeEncodeError: + print( + text.encode(sys.stdout.encoding, errors="replace").decode( + sys.stdout.encoding + ) + ) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 8f7ec3d0d..d03f7c7b3 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -14,7 +14,7 @@ from pythainlp import thai_lead_vowels as lead_v from pythainlp import thai_tonemarks as tonemarks from pythainlp.tokenize import word_tokenize - +from pythainlp.tools import warn_deprecation _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" _RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+") @@ -331,6 +331,9 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]: # output: ['คน', 'คน', 'นก'] """ warn_deprecation( - "pythainlp.util.maiyamok", "pythainlp.util.expand_maiyamok", "5.2" + "pythainlp.util.maiyamok", + "pythainlp.util.expand_maiyamok", + "5.0.5", + "5.2", ) return expand_maiyamok(sent) From a31f117554e865177dca6d11f78122b0513957f3 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 13 Dec 2024 12:44:18 +0000 Subject: [PATCH 4/5] Update docstring of warn_deprecation --- pythainlp/tools/core.py | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/tools/core.py b/pythainlp/tools/core.py index 8e319657e..1c5e5daa2 100644 --- a/pythainlp/tools/core.py +++ b/pythainlp/tools/core.py @@ -19,7 +19,8 @@ def warn_deprecation( :param str deprecated_func: Name of the deprecated function. :param str replacing_func: Name of the function to use instead (optional). - :param str version: PyThaiNLP version in which the function will be deprecated (optional). + :param str deprecated_version: PyThaiNLP version in which the function will be deprecated (optional). + :param str removal_version: PyThaiNLP version in which the function will be removed (optional). """ message = f"The '{deprecated_func}' function is deprecated" if deprecated_version: From b804d4132dcb9853c86026f0fce7773e50b98ad3 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 13 Dec 2024 14:41:23 +0000 Subject: [PATCH 5/5] Use warn_deprecation --- pythainlp/cls/__init__.py | 6 ++---- pythainlp/corpus/common.py | 15 ++++++++++----- pythainlp/phayathaibert/core.py | 2 +- pythainlp/tools/core.py | 4 ++-- pythainlp/util/thaiwordcheck.py | 16 +++++++++------- pythainlp/wangchanberta/core.py | 2 +- 6 files changed, 25 insertions(+), 20 deletions(-) diff --git a/pythainlp/cls/__init__.py b/pythainlp/cls/__init__.py index d4cc162f7..d036cef1d 100644 --- a/pythainlp/cls/__init__.py +++ b/pythainlp/cls/__init__.py @@ -5,12 +5,10 @@ pythainlp.cls Depreciated. Use pythainlp.classify instead. 
""" -import warnings __all__ = ["GzipModel"] from pythainlp.classify.param_free import GzipModel +from pythainlp.tools import warn_deprecation -warnings.warn( - "Deprecated: Use pythainlp.classify instead.", DeprecationWarning -) +warn_deprecation("pythainlp.cls", "pythainlp.classify", "5.1", "5.2") diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index a44544aa6..8c822d843 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -24,9 +24,9 @@ ] from typing import FrozenSet, List, Union -import warnings from pythainlp.corpus import get_corpus, get_corpus_as_is, get_corpus_path +from pythainlp.tools import warn_deprecation _THAI_COUNTRIES: FrozenSet[str] = frozenset() _THAI_COUNTRIES_FILENAME = "countries_th.txt" @@ -56,9 +56,9 @@ _THAI_ORST_WORDS: FrozenSet[str] = frozenset() -_THAI_DICT = {} -_THAI_WSD_DICT = {} -_THAI_SYNONYMS = {} +_THAI_DICT: dict[str, list] = {} +_THAI_WSD_DICT: dict[str, list] = {} +_THAI_SYNONYMS: dict[str, list] = {} def countries() -> FrozenSet[str]: @@ -336,7 +336,12 @@ def thai_synonyms() -> dict: def thai_synonym() -> dict: - warnings.warn("Deprecated: Use thai_synonyms() instead.", DeprecationWarning) + warn_deprecation( + "pythainlp.corpus.thai_synonym", + "pythainlp.corpus.thai_synonyms", + "5.1", + "5.2", + ) return thai_synonyms() diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index f3563a45e..acd5d39cb 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -394,7 +394,7 @@ def get_ner( if pos: warnings.warn( "This model doesn't support output \ - postag and It doesn't output the postag." + postag and it doesn't output the postag." ) sample_output = [] diff --git a/pythainlp/tools/core.py b/pythainlp/tools/core.py index 1c5e5daa2..54396f4a2 100644 --- a/pythainlp/tools/core.py +++ b/pythainlp/tools/core.py @@ -19,8 +19,8 @@ def warn_deprecation( :param str deprecated_func: Name of the deprecated function. 
:param str replacing_func: Name of the function to use instead (optional). - :param str deprecated_version: PyThaiNLP version in which the function will be deprecated (optional). - :param str removal_version: PyThaiNLP version in which the function will be removed (optional). + :param str deprecated_version: Version since which the function has been deprecated (optional). + :param str removal_version: Version in which the function will be removed (optional). """ message = f"The '{deprecated_func}' function is deprecated" if deprecated_version: diff --git a/pythainlp/util/thaiwordcheck.py b/pythainlp/util/thaiwordcheck.py index 0fe3f296b..a67071e70 100644 --- a/pythainlp/util/thaiwordcheck.py +++ b/pythainlp/util/thaiwordcheck.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 -import warnings +from pythainlp.tools import warn_deprecation + def is_native_thai(word: str) -> bool: - warnings.warn( - """ - pythainlp.util.is_native_thai is rename as \ - pythainlp.morpheme.is_native_thai. - This function will remove in PyThaiNLP 5.1. - """, DeprecationWarning) + warn_deprecation( + "pythainlp.util.is_native_thai", + "pythainlp.morpheme.is_native_thai", + "5.0", + "5.1", + ) + from pythainlp.morpheme import is_native_thai as check return check(word) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index afae29c40..4227be8fa 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -182,7 +182,7 @@ def get_ner( if pos: warnings.warn( - "This model doesn't support output postag and It doesn't output the postag." + "This model doesn't support output postag and it doesn't output the postag." ) words_token = word_tokenize(text.replace(" ", "<_>")) inputs = self.tokenizer(