Skip to content

Commit

Permalink
Merge pull request #695 from PyThaiNLP/dev
Browse files Browse the repository at this point in the history
PyThaiNLP v3.1.0-dev1
  • Loading branch information
wannaphong authored Sep 1, 2022
2 parents 4862cf1 + a700564 commit 1b2f39b
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 14 deletions.
2 changes: 1 addition & 1 deletion docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ OSKut==1.3
nlpo3==1.2.2
thai-nner==0.3
spacy==2.3.*
wunsen==0.0.1
wunsen==0.0.3
khanaa==0.0.6
2 changes: 1 addition & 1 deletion pythainlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Copyright (C) 2016-2022 PyThaiNLP Project
# URL: <https://pythainlp.github.io/>
# For license information, see LICENSE
__version__ = "3.1.0-dev0"
__version__ = "3.1.0-dev1"

thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars

Expand Down
80 changes: 71 additions & 9 deletions pythainlp/transliterate/wunsen.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""
Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
to Thai text
By Wunsen
:See Also:
Expand All @@ -12,25 +13,40 @@

class WunsenTransliterate:
"""
Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
to Thai text
by Wunsen
:See Also:
* `GitHub \
<https://github.com/cakimpei/wunsen>`_
"""

def __init__(self) -> None:
self.thap_value = None
self.lang = None
self.jp_input = None
self.zh_sandhi = None
self.system = None

def transliterate(self, text: str, lang: str, jp_input: str = None):
def transliterate(
self,
text: str,
lang: str,
jp_input: str = None,
zh_sandhi: bool = None,
system: str = None,
):
"""
Use Wunsen for transliteration
:param str text: text to be transliterated to Thai text.
:param str lang: source language
:param str jp_input: Japanese input method (for Japanese only)
:param bool zh_sandhi: Mandarin third tone sandhi option
(for Mandarin only)
:param str system: transliteration system (for Japanese and
Mandarin only)
:return: Thai text
:rtype: str
Expand All @@ -39,8 +55,22 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
* *jp* - Japanese (from Hepburn romanization)
* *ko* - Korean (from Revised Romanization)
* *vi* - Vietnamese (Latin script)
* *zh* - Mandarin (from Hanyu Pinyin)
:Options for jp_input:
* *Hepburn-no diacritic* - Hepburn-no diacritic (without macron)
:Options for zh_sandhi:
* *True* - apply third tone sandhi rule
* *False* - do not apply third tone sandhi rule
:Options for system:
* *ORS61* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
(สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)
* *RI35* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
(ราชบัณฑิตยสถาน พ.ศ. 2535)
* *RI49* - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน
(ราชบัณฑิตยสถาน พ.ศ. 2549)
* *THC43* - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน
ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร
ภาษาจีน พ.ศ. 2543)
:Example:
::
Expand All @@ -58,24 +88,56 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
)
# output: 'โอฮาโย'
wt.transliterate("ohayō", lang="jp", system="RI35")
# output: 'โอะฮะโย'
wt.transliterate("annyeonghaseyo", lang="ko")
# output: 'อันนย็องฮาเซโย'
wt.transliterate("xin chào", lang="vi")
# output: 'ซีน จ่าว'
wt.transliterate("ni3 hao3", lang="zh")
# output: 'หนี เห่า'
wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False)
# output: 'หนี่ เห่า'
wt.transliterate("ni3 hao3", lang="zh", system="RI49")
# output: 'หนี ห่าว'
"""
if self.lang != lang or self.jp_input != jp_input:
if (
self.lang != lang
or self.jp_input != jp_input
or self.zh_sandhi != zh_sandhi
or self.system != system
):
if lang == "jp":
if jp_input is None:
self.thap_value = ThapSap("ja")
else:
self.thap_value = ThapSap("ja", input=jp_input)
self.jp_input = jp_input
self.zh_sandhi = None
self.system = system
elif lang == "zh":
self.jp_input = None
self.zh_sandhi = zh_sandhi
self.system = system
elif lang == "ko" or lang == "vi":
self.jp_input = None
self.thap_value = ThapSap(lang)
self.zh_sandhi = None
self.system = None
else:
raise NotImplementedError(
"The %s language is not implemented." % lang
)
self.lang = lang
input_lang = lang
if input_lang == "jp":
input_lang = "ja"
setting = {}
if self.jp_input is not None:
setting.update({"input": self.jp_input})
if self.zh_sandhi is not None:
setting.update({"option": {"sandhi": self.zh_sandhi}})
if self.system is not None:
setting.update({"system": self.system})
self.thap_value = ThapSap(input_lang, **setting)
return self.thap_value.thap(text)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 3.0.8
current_version = 3.1.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,13 @@
"nlpo3>=1.2.2",
"onnxruntime>=1.10.0",
"thai_nner",
"wunsen>=0.0.1"
"wunsen>=0.0.3"
],
}

setup(
name="pythainlp",
version="3.1.0-dev0",
version="3.1.0-dev1",
description="Thai Natural Language Processing library",
long_description=readme,
long_description_content_type="text/markdown",
Expand Down
16 changes: 16 additions & 0 deletions tests/test_transliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ def test_transliterate_wunsen(self):
),
'โอฮาโย'
)
self.assertEqual(
wt.transliterate("ohayō", lang="jp", system="RI35"),
'โอะฮะโย'
)
self.assertEqual(
wt.transliterate("annyeonghaseyo", lang="ko"),
'อันนย็องฮาเซโย'
Expand All @@ -179,6 +183,18 @@ def test_transliterate_wunsen(self):
wt.transliterate("xin chào", lang="vi"),
'ซีน จ่าว'
)
self.assertEqual(
wt.transliterate("ni3 hao3", lang="zh"),
'หนี เห่า'
)
self.assertEqual(
wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False),
'หนี่ เห่า'
)
self.assertEqual(
wt.transliterate("ni3 hao3", lang="zh", system="RI49"),
'หนี ห่าว'
)
with self.assertRaises(NotImplementedError):
wt.transliterate("xin chào", lang="vii")

Expand Down

0 comments on commit 1b2f39b

Please sign in to comment.