Skip to content

Commit

Permalink
Merge pull request #15 from andreihar/14-transliteration-conversion
Browse files (browse the repository at this point in the history)
14 transliteration conversion
  • Loading branch information
andreihar authored Apr 26, 2024
2 parents d3dec16 + 2283a60 commit 9941098
Show file tree
Hide file tree
Showing 11 changed files with 134 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
fail-fast: false
matrix:
platform: [ubuntu-latest, macos-latest, windows-latest]
python: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12-dev']
python: ['3.8', '3.9', '3.10', '3.11', '3.12-dev']
name: Test
runs-on: ${{ matrix.platform }}
steps:
Expand Down
4 changes: 2 additions & 2 deletions taibun/data/words.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"諾": "--hioh",
"乎": "--honnh",
"唷": "--ioh",
"咧": "--leh",
"咧": "leh",
"喔": "--ooh",
"嘖": "--tsheh",
"的": "--ê",
Expand Down Expand Up @@ -2155,7 +2155,7 @@
"呃酸": "eh-sng",
"厄運": "eh-ūn",
"嬰仔名": "enn-á-miâ/inn-á-miâ",
"嬰仔": "enn-á/inn--á",
"嬰仔": "enn-á/inn-á",
"嬰": "enn/inn",
"狹小": "e̍h-sió/ue̍h-sió",
"狹細": "e̍h-sè/ue̍h-suè",
Expand Down
58 changes: 36 additions & 22 deletions taibun/taibun.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
"""
Description: Converts Chinese characters to Taiwanese Hokkien phonetic transcriptions.
Supports both Traditional and Simplified characters.
Invariant: dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning)
system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA`
Invariant: system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA`
dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning)
format = `mark` (diacritical), `number` (numeric), `strip` (no tones)
sandhi = True, False
delimiter = String that replaces the default delimiter
sandhi = `auto`, `none`, `exc_last`, `incl_last`
punctuation = `format` (Latin-style, default), `none` (preserve original)
convert_non_cjk = True, False
"""


Expand Down Expand Up @@ -41,14 +42,17 @@ class Converter(object):
tt = '[ТŊ_ТКŊ]'
DEFAULT_DELIMITER = object()
DEFAULT_SANDHI = object()
__suffixes = ['啊','矣','喂','欸','唅','嘿','諾','乎','唷','喔','嘖','的']
__no_sandhi = ['這','彼','遮','遐']

def __init__(self, system='Tailo', dialect='south', format='mark', delimiter=DEFAULT_DELIMITER, sandhi=DEFAULT_SANDHI, punctuation='format'):
def __init__(self, system='Tailo', dialect='south', format='mark', delimiter=DEFAULT_DELIMITER, sandhi=DEFAULT_SANDHI, punctuation='format', convert_non_cjk=False):
self.system = system.lower()
self.dialect = dialect.lower()
self.format = format
self.delimiter = delimiter
self.sandhi = sandhi
self.punctuation = punctuation
self.convert_non_cjk = convert_non_cjk


### Interface functions
Expand All @@ -69,12 +73,13 @@ def get(self, input):

# Helper to convert separate words
def __convert_tokenised(self, word):
if word[0] not in word_dict:
if word[0] in word_dict:
word = (word_dict[word[0]],) + word[1:]
if "/" in word[0]:
dialect_part = word[0].split("/")[1] if self.dialect == 'north' else word[0].split("/")[0]
word = (dialect_part,) + word[1:]
elif not self.convert_non_cjk:
return word[0]
word = (word_dict[word[0]],) + word[1:]
if "/" in word[0]:
dialect_part = word[0].split("/")[1] if self.dialect == 'north' else word[0].split("/")[0]
word = (dialect_part,) + word[1:]
word = self.__system_conversion(word).replace('---', '--')
if self.format == 'number' and self.system in ['tailo', 'poj']:
word = self.__mark_to_number(word)
Expand All @@ -97,7 +102,7 @@ def __system_conversion(self, word):
if self.system == 'pingyim': return self.__tailo_to_pingyim(word)
if self.system == 'tongiong': return self.__tailo_to_ti(word)
if self.system == 'ipa': return self.__tailo_to_ipa(word)
if self.sandhi: return self.__tailo_to_tailo(word)
if self.sandhi in ['auto', 'exc_last', 'incl_last']: return self.__tailo_to_tailo(word)
else: return word[0]


Expand All @@ -110,8 +115,8 @@ def __set_default_delimiter(self):

# Helper function to set sandhi according to the transliteration system if it wasn't explicitly defined by the user
def __set_default_sandhi(self):
if self.system == 'tongiong': return True
return False
if self.system == 'tongiong': return 'auto'
return 'none'


### Conversion functions
Expand All @@ -120,8 +125,12 @@ def __set_default_sandhi(self):
def __get_number_tones(self, input):
words = self.__preprocess_word(input[0])
number_tones = [self.__get_number_tone(w) for w in words if len(w) > 0]
if self.sandhi:
number_tones = self.__tone_sandhi(number_tones, input[1])
if self.sandhi in ['auto', 'exc_last', 'incl_last'] or self.format == 'number':
replace_with_zero = False
number_tones = [s[:-1] + '0' if replace_with_zero or (replace_with_zero := s[-1] == '0') else s for s in number_tones]
if self.sandhi in ['auto', 'exc_last', 'incl_last']:
index = next((i for i, s in enumerate(number_tones) if s.startswith(self.suffix_token)), len(number_tones))
number_tones = self.__tone_sandhi(number_tones[:index], False) + number_tones[index:] if len(number_tones) != index and len(number_tones) > 1 else self.__tone_sandhi(number_tones, input[1])
return number_tones


Expand Down Expand Up @@ -149,7 +158,7 @@ def __get_number_tone(self, input):
elif re.search('̍', input): input += '8'
elif input[-1] in finals: input += '4'
else: input += '1'
if input[0] == '+' and input[-1] == '4':
if input.startswith(self.suffix_token) and (input[-2] == 'h' or self.sandhi in ['auto', 'exc_last', 'incl_last'] or self.format == 'number'):
input = input[:-1] + '0'
input = "".join(c for c in unicodedata.normalize("NFD", input) if unicodedata.category(c) != "Mn")
return input
Expand All @@ -164,9 +173,9 @@ def __preprocess_word(self, word):
def __get_mark_tone(self, input, placement, tones):
for s in placement:
if s.replace(self.tt, '') in input:
part = s
input = input.replace(s.replace(self.tt, ''), s.replace(self.tt, tones[int(input[-1])]))
break
return unicodedata.normalize('NFC', input.replace(part.replace(self.tt, ''), part.replace(self.tt, tones[int(input[-1])]))[:-1])
return unicodedata.normalize('NFC', input[:-1])


# Helper to apply tone sandhi to a word
Expand All @@ -183,9 +192,14 @@ def __tone_sandhi(self, words, last):

# Helper to define which words should be sandhi'd fully
def __tone_sandhi_position(self, input):
result_list = []
for i, char in enumerate(input):
result_list.append((char, (i < len(input) - 1 and is_cjk(input[i+1]))))
sandhi_logic = {
'exc_last': [(char, False if i == len(input) - 1 else True) for i, char in enumerate(input)],
'incl_last': [(char, True) for char in input],
}
result_list = sandhi_logic.get(self.sandhi, [(char, False if char in self.__no_sandhi else (i < len(input) - 1 and is_cjk(input[i+1]))) for i, char in enumerate(input)])
for i in range(len(result_list) - 2, -1, -1):
if result_list[i+1][0] in self.__suffixes:
result_list[i] = (result_list[i][0], False)
return result_list


Expand Down Expand Up @@ -224,7 +238,7 @@ def __tailo_to_poj(self, input):
# Helper to convert syllable from Tai-lo to 方音符號 (zhuyin)
def __tailo_to_zhuyin(self, input):
convert = {
'p4':'ㆴ4', 'p8':'ㆴ8', 'k4':'ㆶ4', 'k8':'ㆶ8', 't4':'ㆵ4', 't8':'ㆵ8', 'h4':'ㆷ4', 'h8':'ㆷ8',
'p4':'ㆴ4', 'p8':'ㆴ8', 'k4':'ㆶ4', 'k8':'ㆶ8', 't4':'ㆵ4', 't8':'ㆵ8', 'h4':'ㆷ4', 'h8':'ㆷ8', 'h0': '0',
'tshing':'ㄑㄧㄥ', 'tshinn':'ㄑㆪ', 'phing':'ㄆㄧㄥ', 'phinn':'ㄆㆪ', 'tsing':'ㄐㄧㄥ', 'tsinn':'ㄐㆪ',
'ainn':'ㆮ', 'aunn':'ㆯ', 'giok':'ㆣㄧㄜㆶ', 'ngai':'ㄫㄞ', 'ngau':'ㄫㄠ', 'ngoo':'ㄫㆦ', 'ping':'ㄅㄧㄥ',
'pinn':'ㄅㆪ', 'senn':'ㄙㆥ', 'sing':'ㄒㄧㄥ', 'sinn':'ㄒㆪ', 'tshi':'ㄑㄧ',
Expand Down Expand Up @@ -334,7 +348,7 @@ def __tailo_to_ipa(self, input):
if self.dialect == 'north':
convert.update({'o':'o'})
convert2 = {
'p4':'p̚4','p8':'p̚8','k4':'k̚4','k8':'k̚8','t4':'t̚4','t8':'t̚8','h4':'ʔ4','h8':'ʔ8','si':'ɕi'}
'p4':'p̚4','p8':'p̚8','k4':'k̚4','k8':'k̚8','t4':'t̚4','t8':'t̚8','h4':'ʔ4','h8':'ʔ8','si':'ɕi','h0':'ʔ0'}
tones = ['', '⁴⁴', '⁵³', '¹¹', '²¹', '²⁵', '', '²²', '⁵'] if self.dialect != 'north' else ['', '⁵⁵', '⁵¹', '²¹', '³²', '²⁴', '', '³³', '⁴']
convert.update({k.capitalize(): v.capitalize() for k, v in convert.items()})
convert2.update({k.capitalize(): v.capitalize() for k, v in convert2.items()})
Expand Down
8 changes: 4 additions & 4 deletions tests/test_delimiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_default():
(["ㄆㄧㄠ ㄍㆤ˪","ㄢˋ ㄋㆤ/ㄢˋ ㄋㄧ","ㄍㄞ˪ ㄒㄧㄠ˫ ㆢㄧㄣˊ/ㄍㄞ˪ ㄒㄧㄠ˫ ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㆢㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㆢㄧㆵ˙ ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㄌㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㄌㄧㆵ˙ ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙ ㄅㆭ˫"], "Zhuyin"),
(["phiau1 ke3","an2 ne1/an2 ni1","kai3 siau7 jin5/kai3 siau7 lin5","ciok4 li2 kin1 a2 jit8 cin1 ho2 e5 jit8 ci2/ciok4 li2 kin1 a2 lit8 cin1 ho2 e5 lit8 ci2","lan2 e5 ciah8 png7"], "TLPA"),
(["piāogè","ǎnlnē/ǎnlnī","gàisiâozzín/gàisiâolín","ziōk lǐ gīnǎzzít zīnhǒ é zzítzǐ/ziōk lǐ gīnǎlít zīnhǒ é lítzǐ","lǎn é ziáhbn̂g"], "Pingyim"),
(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hor--ē rīt-zì/ziok li gīn-a-līt zīn-hor--ê līt-zì","lan--ē ziâ-bn̄g"], "Tongiong")
(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hòr--e̊ rīt-zì/ziok li gīn-a-līt zīn-hòr--e̊ līt-zì","làn--e̊ ziâ-bn̄g"], "Tongiong")
]
for transl, system in test_data:
data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
Expand All @@ -23,7 +23,7 @@ def test_hyphen():
(["ㄆㄧㄠ-ㄍㆤ˪","ㄢˋ-ㄋㆤ/ㄢˋ-ㄋㄧ","ㄍㄞ˪-ㄒㄧㄠ˫-ㆢㄧㄣˊ/ㄍㄞ˪-ㄒㄧㄠ˫-ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ-ㄚˋ-ㆢㄧㆵ˙ ㄐㄧㄣ-ㄏㄜˋ ㆤˊ ㆢㄧㆵ˙-ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ-ㄚˋ-ㄌㄧㆵ˙ ㄐㄧㄣ-ㄏㄜˋ ㆤˊ ㄌㄧㆵ˙-ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙-ㄅㆭ˫"], "Zhuyin"),
(["phiau1-ke3","an2-ne1/an2-ni1","kai3-siau7-jin5/kai3-siau7-lin5","ciok4 li2 kin1-a2-jit8 cin1-ho2 e5 jit8-ci2/ciok4 li2 kin1-a2-lit8 cin1-ho2 e5 lit8-ci2","lan2 e5 ciah8-png7"], "TLPA"),
(["piāo-gè","ǎn-lnē/ǎn-lnī","gài-siâo-zzín/gài-siâo-lín","ziōk lǐ gīn-ǎ-zzít zīn-hǒ é zzít-zǐ/ziōk lǐ gīn-ǎ-lít zīn-hǒ é lít-zǐ","lǎn é ziáh-bn̂g"], "Pingyim"),
(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hor--ē rīt-zì/ziok li gīn-a-līt zīn-hor--ê līt-zì","lan--ē ziâ-bn̄g"], "Tongiong")
(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hòr--e̊ rīt-zì/ziok li gīn-a-līt zīn-hòr--e̊ līt-zì","làn--e̊ ziâ-bn̄g"], "Tongiong")
]
for transl, system in test_data:
data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
Expand All @@ -36,7 +36,7 @@ def test_space():
(["ㄆㄧㄠ ㄍㆤ˪","ㄢˋ ㄋㆤ/ㄢˋ ㄋㄧ","ㄍㄞ˪ ㄒㄧㄠ˫ ㆢㄧㄣˊ/ㄍㄞ˪ ㄒㄧㄠ˫ ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㆢㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㆢㄧㆵ˙ ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㄌㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㄌㄧㆵ˙ ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙ ㄅㆭ˫"], "Zhuyin"),
(["phiau1 ke3","an2 ne1/an2 ni1","kai3 siau7 jin5/kai3 siau7 lin5","ciok4 li2 kin1 a2 jit8 cin1 ho2 e5 jit8 ci2/ciok4 li2 kin1 a2 lit8 cin1 ho2 e5 lit8 ci2","lan2 e5 ciah8 png7"], "TLPA"),
(["piāo gè","ǎn lnē/ǎn lnī","gài siâo zzín/gài siâo lín","ziōk lǐ gīn ǎ zzít zīn hǒ é zzít zǐ/ziōk lǐ gīn ǎ lít zīn hǒ é lít zǐ","lǎn é ziáh bn̂g"], "Pingyim"),
(["piāu gê","an ne/an ni","gài siâu rĭn/gài siâu lĭn","ziok li gīn a rīt zīn hor--ē rīt zì/ziok li gīn a līt zīn hor--ê līt zì","lan--ē ziâ bn̄g"], "Tongiong")
(["piāu gê","an ne/an ni","gài siâu rĭn/gài siâu lĭn","ziok li gīn a rīt zīn hòr--e̊ rīt zì/ziok li gīn a līt zīn hòr--e̊ līt zì","làn--e̊ ziâ bn̄g"], "Tongiong")
]
for transl, system in test_data:
data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
Expand All @@ -49,7 +49,7 @@ def test_nospace():
(["ㄆㄧㄠㄍㆤ˪","ㄢˋㄋㆤ/ㄢˋㄋㄧ","ㄍㄞ˪ㄒㄧㄠ˫ㆢㄧㄣˊ/ㄍㄞ˪ㄒㄧㄠ˫ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣㄚˋㆢㄧㆵ˙ ㄐㄧㄣㄏㄜˋ ㆤˊ ㆢㄧㆵ˙ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣㄚˋㄌㄧㆵ˙ ㄐㄧㄣㄏㄜˋ ㆤˊ ㄌㄧㆵ˙ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙ㄅㆭ˫"], "Zhuyin"),
(["phiau1ke3","an2ne1/an2ni1","kai3siau7jin5/kai3siau7lin5","ciok4 li2 kin1a2jit8 cin1ho2 e5 jit8ci2/ciok4 li2 kin1a2lit8 cin1ho2 e5 lit8ci2","lan2 e5 ciah8png7"], "TLPA"),
(["piāogè","ǎnlnē/ǎnlnī","gàisiâozzín/gàisiâolín","ziōk lǐ gīnǎzzít zīnhǒ é zzítzǐ/ziōk lǐ gīnǎlít zīnhǒ é lítzǐ","lǎn é ziáhbn̂g"], "Pingyim"),
(["piāugê","anne/anni","gàisiâurĭn/gàisiâulĭn","ziok li gīnarīt zīnhor--ē rītzì/ziok li gīnalīt zīnhor--ê lītzì","lan--ē ziâbn̄g"], "Tongiong")
(["piāugê","anne/anni","gàisiâurĭn/gàisiâulĭn","ziok li gīnarīt zīnhòr--e̊ rītzì/ziok li gīnalīt zīnhòr--e̊ lītzì","làn--e̊ ziâbn̄g"], "Tongiong")
]
for transl, system in test_data:
data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_ipa_conversion.py

Large diffs are not rendered by default.

Loading

0 comments on commit 9941098

Please sign in to comment.