Merge pull request #15 from andreihar/14-transliteration-conversion

14 transliteration conversion
andreihar · Apr 26, 2024 · 9941098 · 9941098
2 parents d3dec16 + 2283a60
commit 9941098
Show file tree

Hide file tree

Showing 11 changed files with 134 additions and 52 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -8,7 +8,7 @@ jobs:
       fail-fast: false
       matrix:
         platform: [ubuntu-latest, macos-latest, windows-latest]
-        python: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12-dev']
+        python: ['3.8', '3.9', '3.10', '3.11', '3.12-dev']
     name: Test
     runs-on: ${{ matrix.platform }}
     steps:

diff --git a/taibun/data/words.json b/taibun/data/words.json
@@ -8,7 +8,7 @@
 	"諾": "--hioh",
 	"乎": "--honnh",
 	"唷": "--ioh",
-	"咧": "--leh",
+	"咧": "leh",
 	"喔": "--ooh",
 	"嘖": "--tsheh",
 	"的": "--ê",
@@ -2155,7 +2155,7 @@
 	"呃酸": "eh-sng",
 	"厄運": "eh-ūn",
 	"嬰仔名": "enn-á-miâ/inn-á-miâ",
-	"嬰仔": "enn-á/inn--á",
+	"嬰仔": "enn-á/inn-á",
 	"嬰": "enn/inn",
 	"狹小": "e̍h-sió/ue̍h-sió",
 	"狹細": "e̍h-sè/ue̍h-suè",

diff --git a/taibun/taibun.py b/taibun/taibun.py
@@ -6,12 +6,13 @@
 """
 Description: Converts Chinese characters to Taiwanese Hokkien phonetic transcriptions.
              Supports both Traditional and Simplified characters.
-Invariant: dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning)
-           system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA`
+Invariant: system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA`
+           dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning)
            format = `mark` (diacritical), `number` (numeric), `strip` (no tones)
-           sandhi = True, False
            delimiter = String that replaces the default delimiter
+           sandhi = `auto`, `none`, `exc_last`, `incl_last`
            punctuation = `format` (Latin-style, default), `none` (preserve original)
+           convert_non_cjk = True, False
 """
 
 
@@ -41,14 +42,17 @@ class Converter(object):
     tt = '[ТŊ_ТКŊ]'
     DEFAULT_DELIMITER = object()
     DEFAULT_SANDHI = object()
+    __suffixes = ['啊','矣','喂','欸','唅','嘿','諾','乎','唷','喔','嘖','的']
+    __no_sandhi = ['這','彼','遮','遐']
 
-    def __init__(self, system='Tailo', dialect='south', format='mark', delimiter=DEFAULT_DELIMITER, sandhi=DEFAULT_SANDHI, punctuation='format'):
+    def __init__(self, system='Tailo', dialect='south', format='mark', delimiter=DEFAULT_DELIMITER, sandhi=DEFAULT_SANDHI, punctuation='format', convert_non_cjk=False):
         self.system = system.lower()
         self.dialect = dialect.lower()
         self.format = format
         self.delimiter = delimiter
         self.sandhi = sandhi
         self.punctuation = punctuation
+        self.convert_non_cjk = convert_non_cjk
 
 
     ### Interface functions
@@ -69,12 +73,13 @@ def get(self, input):
 
     # Helper to convert separate words
     def __convert_tokenised(self, word):
-        if word[0] not in word_dict:
+        if word[0] in word_dict:
+            word = (word_dict[word[0]],) + word[1:]
+            if "/" in word[0]:
+                dialect_part = word[0].split("/")[1] if self.dialect == 'north' else word[0].split("/")[0]
+                word = (dialect_part,) + word[1:]
+        elif not self.convert_non_cjk:
             return word[0]
-        word = (word_dict[word[0]],) + word[1:]
-        if "/" in word[0]:
-            dialect_part = word[0].split("/")[1] if self.dialect == 'north' else word[0].split("/")[0]
-            word = (dialect_part,) + word[1:]
         word = self.__system_conversion(word).replace('---', '--')
         if self.format == 'number' and self.system in ['tailo', 'poj']:
             word = self.__mark_to_number(word)
@@ -97,7 +102,7 @@ def __system_conversion(self, word):
         if self.system == 'pingyim': return self.__tailo_to_pingyim(word)
         if self.system == 'tongiong': return self.__tailo_to_ti(word)
         if self.system == 'ipa': return self.__tailo_to_ipa(word)
-        if self.sandhi: return self.__tailo_to_tailo(word)
+        if self.sandhi in ['auto', 'exc_last', 'incl_last']: return self.__tailo_to_tailo(word)
         else: return word[0]
 
 
@@ -110,8 +115,8 @@ def __set_default_delimiter(self):
 
     # Helper functions to set sandhi according to transliteration system if wasn't explicitly defined by user
     def __set_default_sandhi(self):
-        if self.system == 'tongiong': return True
-        return False
+        if self.system == 'tongiong': return 'auto'
+        return 'none'
 
 
     ### Conversion functions
@@ -120,8 +125,12 @@ def __set_default_sandhi(self):
     def __get_number_tones(self, input):
         words = self.__preprocess_word(input[0])
         number_tones = [self.__get_number_tone(w) for w in words if len(w) > 0]
-        if self.sandhi:
-            number_tones = self.__tone_sandhi(number_tones, input[1])
+        if self.sandhi in ['auto', 'exc_last', 'incl_last'] or self.format == 'number':
+            replace_with_zero = False
+            number_tones = [s[:-1] + '0' if replace_with_zero or (replace_with_zero := s[-1] == '0') else s for s in number_tones]
+        if self.sandhi in ['auto', 'exc_last', 'incl_last']:
+            index = next((i for i, s in enumerate(number_tones) if s.startswith(self.suffix_token)), len(number_tones))
+            number_tones = self.__tone_sandhi(number_tones[:index], False) + number_tones[index:] if len(number_tones) != index and len(number_tones) > 1 else self.__tone_sandhi(number_tones, input[1])
         return number_tones
 
 
@@ -149,7 +158,7 @@ def __get_number_tone(self, input):
         elif re.search('̍', input): input += '8'
         elif input[-1] in finals: input += '4'
         else: input += '1'
-        if input[0] == '+' and input[-1] == '4':
+        if input.startswith(self.suffix_token) and (input[-2] == 'h' or self.sandhi in ['auto', 'exc_last', 'incl_last'] or self.format == 'number'):
             input = input[:-1] + '0'
         input = "".join(c for c in unicodedata.normalize("NFD", input) if unicodedata.category(c) != "Mn")
         return input
@@ -164,9 +173,9 @@ def __preprocess_word(self, word):
     def __get_mark_tone(self, input, placement, tones):
         for s in placement:
             if s.replace(self.tt, '') in input:
-                part = s
+                input = input.replace(s.replace(self.tt, ''), s.replace(self.tt, tones[int(input[-1])]))
                 break
-        return unicodedata.normalize('NFC', input.replace(part.replace(self.tt, ''), part.replace(self.tt, tones[int(input[-1])]))[:-1])
+        return unicodedata.normalize('NFC', input[:-1])
 
 
     # Helper to apply tone sandhi to a word
@@ -183,9 +192,14 @@ def __tone_sandhi(self, words, last):
 
     # Helper to define which words should be sandhi'd fully
     def __tone_sandhi_position(self, input):
-        result_list = []
-        for i, char in enumerate(input):
-            result_list.append((char, (i < len(input) - 1 and is_cjk(input[i+1]))))
+        sandhi_logic = {
+            'exc_last': [(char, False if i == len(input) - 1 else True) for i, char in enumerate(input)],
+            'incl_last': [(char, True) for char in input],
+        }
+        result_list = sandhi_logic.get(self.sandhi, [(char, False if char in self.__no_sandhi else (i < len(input) - 1 and is_cjk(input[i+1]))) for i, char in enumerate(input)])
+        for i in range(len(result_list) - 2, -1, -1):
+            if result_list[i+1][0] in self.__suffixes:
+                result_list[i] = (result_list[i][0], False) 
         return result_list
 
 
@@ -224,7 +238,7 @@ def __tailo_to_poj(self, input):
     # Helper to convert syllable from Tai-lo to 方音符號 (zhuyin)
     def __tailo_to_zhuyin(self, input):
         convert = {
-            'p4':'ㆴ4', 'p8':'ㆴ8', 'k4':'ㆶ4', 'k8':'ㆶ8', 't4':'ㆵ4', 't8':'ㆵ8', 'h4':'ㆷ4', 'h8':'ㆷ8',
+            'p4':'ㆴ4', 'p8':'ㆴ8', 'k4':'ㆶ4', 'k8':'ㆶ8', 't4':'ㆵ4', 't8':'ㆵ8', 'h4':'ㆷ4', 'h8':'ㆷ8', 'h0': '0',
             'tshing':'ㄑㄧㄥ', 'tshinn':'ㄑㆪ', 'phing':'ㄆㄧㄥ', 'phinn':'ㄆㆪ', 'tsing':'ㄐㄧㄥ', 'tsinn':'ㄐㆪ',
             'ainn':'ㆮ', 'aunn':'ㆯ', 'giok':'ㆣㄧㄜㆶ', 'ngai':'ㄫㄞ', 'ngau':'ㄫㄠ', 'ngoo':'ㄫㆦ', 'ping':'ㄅㄧㄥ',
             'pinn':'ㄅㆪ', 'senn':'ㄙㆥ', 'sing':'ㄒㄧㄥ', 'sinn':'ㄒㆪ', 'tshi':'ㄑㄧ',
@@ -334,7 +348,7 @@ def __tailo_to_ipa(self, input):
         if self.dialect == 'north':
             convert.update({'o':'o'})
         convert2 = {
-            'p4':'p̚4','p8':'p̚8','k4':'k̚4','k8':'k̚8','t4':'t̚4','t8':'t̚8','h4':'ʔ4','h8':'ʔ8','si':'ɕi'}
+            'p4':'p̚4','p8':'p̚8','k4':'k̚4','k8':'k̚8','t4':'t̚4','t8':'t̚8','h4':'ʔ4','h8':'ʔ8','si':'ɕi','h0':'ʔ0'}
         tones = ['', '⁴⁴', '⁵³', '¹¹', '²¹', '²⁵', '', '²²', '⁵'] if self.dialect != 'north' else ['', '⁵⁵', '⁵¹', '²¹', '³²', '²⁴', '', '³³', '⁴']
         convert.update({k.capitalize(): v.capitalize() for k, v in convert.items()})
         convert2.update({k.capitalize(): v.capitalize() for k, v in convert2.items()})

diff --git a/tests/test_delimiter.py b/tests/test_delimiter.py
@@ -10,7 +10,7 @@ def test_default():
 		(["ㄆㄧㄠ ㄍㆤ˪","ㄢˋ ㄋㆤ/ㄢˋ ㄋㄧ","ㄍㄞ˪ ㄒㄧㄠ˫ ㆢㄧㄣˊ/ㄍㄞ˪ ㄒㄧㄠ˫ ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㆢㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㆢㄧㆵ˙ ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㄌㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㄌㄧㆵ˙ ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙ ㄅㆭ˫"], "Zhuyin"),
 		(["phiau1 ke3","an2 ne1/an2 ni1","kai3 siau7 jin5/kai3 siau7 lin5","ciok4 li2 kin1 a2 jit8 cin1 ho2 e5 jit8 ci2/ciok4 li2 kin1 a2 lit8 cin1 ho2 e5 lit8 ci2","lan2 e5 ciah8 png7"], "TLPA"),
 		(["piāogè","ǎnlnē/ǎnlnī","gàisiâozzín/gàisiâolín","ziōk lǐ gīnǎzzít zīnhǒ é zzítzǐ/ziōk lǐ gīnǎlít zīnhǒ é lítzǐ","lǎn é ziáhbn̂g"], "Pingyim"),
-		(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hor--ē rīt-zì/ziok li gīn-a-līt zīn-hor--ê līt-zì","lan--ē ziâ-bn̄g"], "Tongiong")
+		(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hòr--e̊ rīt-zì/ziok li gīn-a-līt zīn-hòr--e̊ līt-zì","làn--e̊ ziâ-bn̄g"], "Tongiong")
     ]
 	for transl, system in test_data:
 		data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
@@ -23,7 +23,7 @@ def test_hyphen():
 		(["ㄆㄧㄠ-ㄍㆤ˪","ㄢˋ-ㄋㆤ/ㄢˋ-ㄋㄧ","ㄍㄞ˪-ㄒㄧㄠ˫-ㆢㄧㄣˊ/ㄍㄞ˪-ㄒㄧㄠ˫-ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ-ㄚˋ-ㆢㄧㆵ˙ ㄐㄧㄣ-ㄏㄜˋ ㆤˊ ㆢㄧㆵ˙-ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ-ㄚˋ-ㄌㄧㆵ˙ ㄐㄧㄣ-ㄏㄜˋ ㆤˊ ㄌㄧㆵ˙-ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙-ㄅㆭ˫"], "Zhuyin"),
 		(["phiau1-ke3","an2-ne1/an2-ni1","kai3-siau7-jin5/kai3-siau7-lin5","ciok4 li2 kin1-a2-jit8 cin1-ho2 e5 jit8-ci2/ciok4 li2 kin1-a2-lit8 cin1-ho2 e5 lit8-ci2","lan2 e5 ciah8-png7"], "TLPA"),
 		(["piāo-gè","ǎn-lnē/ǎn-lnī","gài-siâo-zzín/gài-siâo-lín","ziōk lǐ gīn-ǎ-zzít zīn-hǒ é zzít-zǐ/ziōk lǐ gīn-ǎ-lít zīn-hǒ é lít-zǐ","lǎn é ziáh-bn̂g"], "Pingyim"),
-		(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hor--ē rīt-zì/ziok li gīn-a-līt zīn-hor--ê līt-zì","lan--ē ziâ-bn̄g"], "Tongiong")
+		(["piāu-gê","an-ne/an-ni","gài-siâu-rĭn/gài-siâu-lĭn","ziok li gīn-a-rīt zīn-hòr--e̊ rīt-zì/ziok li gīn-a-līt zīn-hòr--e̊ līt-zì","làn--e̊ ziâ-bn̄g"], "Tongiong")
 	]
 	for transl, system in test_data:
 		data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
@@ -36,7 +36,7 @@ def test_space():
 		(["ㄆㄧㄠ ㄍㆤ˪","ㄢˋ ㄋㆤ/ㄢˋ ㄋㄧ","ㄍㄞ˪ ㄒㄧㄠ˫ ㆢㄧㄣˊ/ㄍㄞ˪ ㄒㄧㄠ˫ ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㆢㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㆢㄧㆵ˙ ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣ ㄚˋ ㄌㄧㆵ˙ ㄐㄧㄣ ㄏㄜˋ ㆤˊ ㄌㄧㆵ˙ ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙ ㄅㆭ˫"], "Zhuyin"),
 		(["phiau1 ke3","an2 ne1/an2 ni1","kai3 siau7 jin5/kai3 siau7 lin5","ciok4 li2 kin1 a2 jit8 cin1 ho2 e5 jit8 ci2/ciok4 li2 kin1 a2 lit8 cin1 ho2 e5 lit8 ci2","lan2 e5 ciah8 png7"], "TLPA"),
 		(["piāo gè","ǎn lnē/ǎn lnī","gài siâo zzín/gài siâo lín","ziōk lǐ gīn ǎ zzít zīn hǒ é zzít zǐ/ziōk lǐ gīn ǎ lít zīn hǒ é lít zǐ","lǎn é ziáh bn̂g"], "Pingyim"),
-		(["piāu gê","an ne/an ni","gài siâu rĭn/gài siâu lĭn","ziok li gīn a rīt zīn hor--ē rīt zì/ziok li gīn a līt zīn hor--ê līt zì","lan--ē ziâ bn̄g"], "Tongiong")
+		(["piāu gê","an ne/an ni","gài siâu rĭn/gài siâu lĭn","ziok li gīn a rīt zīn hòr--e̊ rīt zì/ziok li gīn a līt zīn hòr--e̊ līt zì","làn--e̊ ziâ bn̄g"], "Tongiong")
 	]
 	for transl, system in test_data:
 		data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
@@ -49,7 +49,7 @@ def test_nospace():
 		(["ㄆㄧㄠㄍㆤ˪","ㄢˋㄋㆤ/ㄢˋㄋㄧ","ㄍㄞ˪ㄒㄧㄠ˫ㆢㄧㄣˊ/ㄍㄞ˪ㄒㄧㄠ˫ㄌㄧㄣˊ","ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣㄚˋㆢㄧㆵ˙ ㄐㄧㄣㄏㄜˋ ㆤˊ ㆢㄧㆵ˙ㄐㄧˋ/ㄐㄧㆦㆶ ㄌㄧˋ ㄍㄧㄣㄚˋㄌㄧㆵ˙ ㄐㄧㄣㄏㄜˋ ㆤˊ ㄌㄧㆵ˙ㄐㄧˋ","ㄌㄢˋ ㆤˊ ㄐㄧㄚㆷ˙ㄅㆭ˫"], "Zhuyin"),
 		(["phiau1ke3","an2ne1/an2ni1","kai3siau7jin5/kai3siau7lin5","ciok4 li2 kin1a2jit8 cin1ho2 e5 jit8ci2/ciok4 li2 kin1a2lit8 cin1ho2 e5 lit8ci2","lan2 e5 ciah8png7"], "TLPA"),
 		(["piāogè","ǎnlnē/ǎnlnī","gàisiâozzín/gàisiâolín","ziōk lǐ gīnǎzzít zīnhǒ é zzítzǐ/ziōk lǐ gīnǎlít zīnhǒ é lítzǐ","lǎn é ziáhbn̂g"], "Pingyim"),
-		(["piāugê","anne/anni","gàisiâurĭn/gàisiâulĭn","ziok li gīnarīt zīnhor--ē rītzì/ziok li gīnalīt zīnhor--ê lītzì","lan--ē ziâbn̄g"], "Tongiong")
+		(["piāugê","anne/anni","gàisiâurĭn/gàisiâulĭn","ziok li gīnarīt zīnhòr--e̊ rītzì/ziok li gīnalīt zīnhòr--e̊ lītzì","làn--e̊ ziâbn̄g"], "Tongiong")
 	]
 	for transl, system in test_data:
 		data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]

diff --git a/tests/test_ipa_conversion.py b/tests/test_ipa_conversion.py