Skip to content

Commit

Permalink
Merge pull request #27 from letuananh/main
Browse files Browse the repository at this point in the history
speach 0.1a9 with TTLv2 API
  • Loading branch information
letuananh authored May 27, 2021
2 parents 703168e + 7f07e1e commit 6e4d6c6
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 55 deletions.
7 changes: 7 additions & 0 deletions docs/updates.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
Speach Changelog
================

Speach 0.1a9
------------

- 2021-05-27

- Use TTLv2 API (chirptext >= 0.2a4.post1)

Speach 0.1a8
------------

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chirptext >= 0.1, <0.3
chirptext >= 0.2a4.post1, <0.3
puchikarui >= 0.1, <0.3
4 changes: 2 additions & 2 deletions speach/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
__issue__ = "https://github.com/neocl/speach/issues/"
__maintainer__ = "Le Tuan Anh"
__version_major__ = "0.1" # follow PEP-0440
__version__ = "{}a8".format(__version_major__)
__version_long__ = "{} - Alpha 8".format(__version_major__)
__version__ = "{}a9".format(__version_major__)
__version_long__ = "{} - Alpha 9".format(__version_major__)
__status__ = "3 - Alpha"
4 changes: 2 additions & 2 deletions speach/data/scripts/init_corpus.sql
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ CREATE TABLE IF NOT EXISTS "tag" (
,"wid" INTEGER
,"cfrom" INTEGER
,"cto" INTEGER
,"label" TEXT
,"value" TEXT
,"source" TEXT
,"tagtype" TEXT
,"type" TEXT
,FOREIGN KEY(sid) REFERENCES sentence(ID) ON DELETE CASCADE ON UPDATE CASCADE
);

Expand Down
18 changes: 9 additions & 9 deletions speach/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ def __init__(self, *args, **kwargs):
proto=ttl.Token).set_id('ID')
self.add_table('concept', ['ID', 'sid', 'cidx', 'clemma', 'tag', 'flag', 'comment'],
proto=ttl.Concept).set_id('ID')
self.add_table('tag', ['ID', 'sid', 'wid', 'cfrom', 'cto', 'label', 'source', 'tagtype'],
proto=ttl.Tag).set_id('ID')
self.add_table('tag', ['ID', 'sid', 'wid', 'cfrom', 'cto', 'value', 'source', 'type'], id_cols="ID")
self.add_table('cwl', ['sid', 'cid', 'wid'], proto=CWLink)

@with_ctx
Expand Down Expand Up @@ -169,23 +168,24 @@ def get_sent(self, sentID, ctx=None):
for tk in tokens:
sent.tokens.append(tk)
# select all tags
tags = ctx.tag.select('sid = ?', (sent.ID,))
tags = ctx.execute('SELECT * FROM TAG where sid = ?', (sent.ID,))
for tag in tags:
if tag.wid is None:
sent.tags.append(tag)
elif tag.wid in tokenmap:
tokenmap[tag.wid].tags.append(tag)
# TODO: Don't use _append internal
if tag['wid'] is None:
sent.tags.new(**tag)
elif tag['wid'] in tokenmap:
tokenmap[tag['wid']].tags.new(**tag)
else:
getLogger().warning("Orphan tag in sentence #{}: {}".format(sent.ID, tag))
# select concepts
concepts = ctx.concept.select('sid = ?', (sent.ID,))
conceptmap = {c.ID: c for c in concepts}
for c in concepts:
sent.add_concept(c)
sent.concepts._append(c)
# select cwl
cwlinks = ctx.cwl.select('sid = ?', (sent.ID,))
for cwl in cwlinks:
conceptmap[cwl.cid].add_token(tokenmap[cwl.wid])
conceptmap[cwl.cid].tokens += tokenmap[cwl.wid]
return sent

@with_ctx
Expand Down
44 changes: 25 additions & 19 deletions speach/ttlig.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from chirptext import DataObject, piter
from chirptext import chio
from chirptext.deko import is_kana, parse
from chirptext import deko
from chirptext import ttl


Expand Down Expand Up @@ -62,20 +62,20 @@ def to_ttl(self):
data = self.to_dict()
for l in TTLIG.KNOWN_LABELS:
if l not in ['text', 'orth', 'tokens'] and l in data and data[l]:
ttl_sent.new_tag(data[l], tagtype=l)
ttl_sent.tags.new(data[l], type=l)
if self.tokens:
_tokens = parse_ruby(self.tokens)
ttl_sent.tokens = (t.text() for t in _tokens)
for ttl_token, furi_token in zip(ttl_sent, _tokens):
if furi_token.surface != furi_token.text():
ttl_token.new_tag(furi_token.surface, tagtype='furi')
ttl_token.tags.new(furi_token.surface, type='furi')
if self.morphtrans:
_morphtokens = tokenize(self.morphtrans)
if len(_morphtokens) != len(ttl_sent):
logging.getLogger(__name__).warning("Morphophonemic transliteration line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _morphtokens):
t.new_tag(m, tagtype='mtrans')
t.tags.new(m, type='mtrans')
if self.pos:
_postokens = tokenize(self.pos)
if len(_postokens) != len(ttl_sent):
Expand All @@ -96,14 +96,14 @@ def to_ttl(self):
logging.getLogger(__name__).warning("morpheme-by-morpheme gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _glosstokens):
t.new_tag(m, tagtype='mgloss')
t.tags.new(m, type='mgloss')
if self.wordgloss:
_glosstokens = tokenize(self.wordgloss)
if len(_glosstokens) != len(ttl_sent):
logging.getLogger(__name__).warning("word-by-word gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _glosstokens):
t.new_tag(m, tagtype='wgloss')
t.tags.new(m, type='wgloss')
return ttl_sent

def to_expex(self, default_ident=''):
Expand Down Expand Up @@ -177,7 +177,7 @@ def tsduration(self):
return None
else:
return self.tsto - self.tsfrom

def overlap(self, other):
''' Calculate overlap score between this utterance and another
Score = 0 means adjacent, score > 0 means overlapped, score < 0 means no overlap (the distance between the two)
Expand Down Expand Up @@ -374,14 +374,14 @@ def to_anki(self):
else:
frags.append(str(g))
return ''.join(frags)

def __str__(self):
return self.text()

@staticmethod
def from_furi(surface, kana):
ruby = RubyToken(surface=surface)
if is_kana(surface):
if deko.is_kana(surface):
ruby.append(surface)
return ruby
edit_seq = ndiff(surface, kana)
Expand Down Expand Up @@ -533,18 +533,24 @@ def mctoken_to_furi(token):
return RubyToken.from_furi(token.surface, token.reading_hira())


def text_to_igrow(txt):
''' Parse text to TTLIG format '''
msent = parse(txt)
def ttl_to_igrow(msent):
    ''' Convert a parsed TTL sentence into a TTLIG row (IGRow).

    Uses the TTLv2 token API (chirptext >= 0.2a4.post1): ``pos3``,
    ``reading_hira`` and ``lemma`` are plain attributes, not methods.

    :param msent: a TTL sentence (e.g. the result of ``deko.parse()``)
    :return: an :class:`IGRow` with space-joined ``tokens``, ``pos`` and
             ``lemma`` lines and ``text`` taken from the sentence text
    '''
    tokens = []
    pos = []
    lemmas = []
    for token in msent:
        if token.is_eos:
            # skip end-of-sentence pseudo tokens emitted by the parser
            continue
        # prefer the fine-grained POS (pos3) and fall back to the coarse pos
        pos.append(token.pos3 if token.pos3 else token.pos)
        if token.reading_hira:
            # token has a hiragana reading -> encode as ruby, e.g. {時/じ}
            r = RubyToken.from_furi(token.text, token.reading_hira)
            tokens.append(r.to_code())
        else:
            tokens.append(token.text)
        lemmas.append(token.lemma if token.lemma else '')
    igrow = IGRow(text=msent.text, tokens=' '.join(tokens), pos=' '.join(pos), lemma=' '.join(lemmas))
    return igrow


def text_to_igrow(txt):
    ''' Parse a raw text string into a TTLIG row (IGRow).

    The text is parsed with deko (MeCab) first and the resulting TTL
    sentence is then converted via :func:`ttl_to_igrow`.
    '''
    return ttl_to_igrow(deko.parse(txt))
6 changes: 3 additions & 3 deletions test/data/test.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"text": "三毛猫が好きです。", "tokens": [{"cfrom": 0, "cto": 1, "text": "三", "pos": "名詞-数", "tags": {"pos": ["名詞-数"], "Reading": ["さん"]}}, {"cfrom": 1, "cto": 2, "text": "毛", "pos": "名詞-接尾-助数詞", "tags": {"pos": ["名詞-接尾-助数詞"], "Reading": ["もう"]}}, {"cfrom": 2, "cto": 3, "text": "猫", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["ねこ"]}}, {"cfrom": 3, "cto": 4, "text": "が", "pos": "助詞-格助詞-一般", "tags": {"pos": ["助詞-格助詞-一般"], "Reading": ["が"]}}, {"cfrom": 4, "cto": 6, "text": "好き", "pos": "名詞-形容動詞語幹", "tags": {"pos": ["名詞-形容動詞語幹"], "Reading": ["すき"]}}, {"cfrom": 6, "cto": 8, "text": "です", "pos": "助動詞", "tags": {"pos": ["助動詞"], "Reading": ["です"]}}, {"cfrom": 8, "cto": 9, "text": "。", "pos": "記号-句点", "tags": {"pos": ["記号-句点"], "Reading": ["。"]}}], "concepts": [{"clemma": "wiki.ja:三毛猫", "tag": "三毛猫", "tokens": [0, 1, 2], "comment": "Calico cat, you know?"}], "ID": 1}
{"text": "雨が降る。", "tokens": [{"cfrom": 0, "cto": 1, "text": "雨", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["あめ"]}}, {"cfrom": 1, "cto": 2, "text": "が", "pos": "助詞-格助詞-一般", "tags": {"pos": ["助詞-格助詞-一般"], "Reading": ["が"]}}, {"cfrom": 2, "cto": 4, "text": "降る", "pos": "動詞-自立", "tags": {"pos": ["動詞-自立"], "Reading": ["ふる"]}}, {"cfrom": 4, "cto": 5, "text": "。", "pos": "記号-句点", "tags": {"pos": ["記号-句点"], "Reading": ["。"]}}], "concepts": [{"clemma": "02756821-v", "tag": "降る", "tokens": [2]}], "ID": 2}
{"text": "女の子はケーキを食べる。", "tokens": [{"cfrom": 0, "cto": 3, "text": "女の子", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["おんなのこ"]}}, {"cfrom": 3, "cto": 4, "text": "は", "pos": "助詞-係助詞", "tags": {"pos": ["助詞-係助詞"], "Reading": ["は"]}}, {"cfrom": 4, "cto": 7, "text": "ケーキ", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["けーき"]}}, {"cfrom": 7, "cto": 8, "text": "を", "pos": "助詞-格助詞-一般", "tags": {"pos": ["助詞-格助詞-一般"], "Reading": ["を"]}}, {"cfrom": 8, "cto": 11, "text": "食べる", "pos": "動詞-自立", "comment": "to eat", "tags": {"pos": ["動詞-自立"], "Reading": ["たべる"]}}, {"cfrom": 11, "cto": 12, "text": "。", "pos": "記号-句点", "tags": {"pos": ["記号-句点"], "Reading": ["。"]}}], "concepts": [{"clemma": "10084295-n", "tag": "女の子", "tokens": [0]}, {"clemma": "01166351-v", "tag": "食べる", "tokens": [4]}], "ID": 3}
{"text": "三毛猫が好きです。", "tokens": [{"cfrom": 0, "cto": 1, "text": "三", "lemma": "三", "pos": "名詞", "tags": [{"value": "数", "type": "sc1"}, {"value": "名詞-数", "type": "pos3"}, {"value": "さん", "type": "reading_hira"}]}, {"cfrom": 1, "cto": 2, "text": "毛", "lemma": "毛", "pos": "名詞", "tags": [{"value": "接尾", "type": "sc1"}, {"value": "助数詞", "type": "sc2"}, {"value": "名詞-接尾-助数詞", "type": "pos3"}, {"value": "もう", "type": "reading_hira"}]}, {"cfrom": 2, "cto": 3, "text": "猫", "lemma": "猫", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "ねこ", "type": "reading_hira"}]}, {"cfrom": 3, "cto": 4, "text": "が", "lemma": "が", "pos": "助詞", "tags": [{"value": "格助詞", "type": "sc1"}, {"value": "一般", "type": "sc2"}, {"value": "助詞-格助詞-一般", "type": "pos3"}, {"value": "が", "type": "reading_hira"}]}, {"cfrom": 4, "cto": 6, "text": "好き", "lemma": "好き", "pos": "名詞", "tags": [{"value": "形容動詞語幹", "type": "sc1"}, {"value": "名詞-形容動詞語幹", "type": "pos3"}, {"value": "すき", "type": "reading_hira"}]}, {"cfrom": 6, "cto": 8, "text": "です", "lemma": "です", "pos": "助動詞", "tags": [{"value": "特殊・デス", "type": "inf"}, {"value": "基本形", "type": "conj"}, {"value": "助動詞", "type": "pos3"}, {"value": "です", "type": "reading_hira"}]}, {"cfrom": 8, "cto": 9, "text": "。", "lemma": "。", "pos": "記号", "tags": [{"value": "句点", "type": "sc1"}, {"value": "記号-句点", "type": "pos3"}, {"value": "。", "type": "reading_hira"}]}], "ID": "1"}
{"text": "雨が降る。", "tokens": [{"cfrom": 0, "cto": 1, "text": "雨", "lemma": "雨", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "あめ", "type": "reading_hira"}]}, {"cfrom": 1, "cto": 2, "text": "が", "lemma": "が", "pos": "助詞", "tags": [{"value": "格助詞", "type": "sc1"}, {"value": "一般", "type": "sc2"}, {"value": "助詞-格助詞-一般", "type": "pos3"}, {"value": "が", "type": "reading_hira"}]}, {"cfrom": 2, "cto": 4, "text": "降る", "lemma": "降る", "pos": "動詞", "tags": [{"value": "自立", "type": "sc1"}, {"value": "五段・ラ行", "type": "inf"}, {"value": "基本形", "type": "conj"}, {"value": "動詞-自立", "type": "pos3"}, {"value": "ふる", "type": "reading_hira"}]}, {"cfrom": 4, "cto": 5, "text": "。", "lemma": "。", "pos": "記号", "tags": [{"value": "句点", "type": "sc1"}, {"value": "記号-句点", "type": "pos3"}, {"value": "。", "type": "reading_hira"}]}], "ID": "2"}
{"text": "女の子はケーキを食べる。", "tokens": [{"cfrom": 0, "cto": 3, "text": "女の子", "lemma": "女の子", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "おんなのこ", "type": "reading_hira"}]}, {"cfrom": 3, "cto": 4, "text": "は", "lemma": "は", "pos": "助詞", "tags": [{"value": "係助詞", "type": "sc1"}, {"value": "助詞-係助詞", "type": "pos3"}, {"value": "は", "type": "reading_hira"}]}, {"cfrom": 4, "cto": 7, "text": "ケーキ", "lemma": "ケーキ", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "けーき", "type": "reading_hira"}]}, {"cfrom": 7, "cto": 8, "text": "を", "lemma": "を", "pos": "助詞", "tags": [{"value": "格助詞", "type": "sc1"}, {"value": "一般", "type": "sc2"}, {"value": "助詞-格助詞-一般", "type": "pos3"}, {"value": "を", "type": "reading_hira"}]}, {"cfrom": 8, "cto": 11, "text": "食べる", "lemma": "食べる", "pos": "動詞", "tags": [{"value": "自立", "type": "sc1"}, {"value": "一段", "type": "inf"}, {"value": "基本形", "type": "conj"}, {"value": "動詞-自立", "type": "pos3"}, {"value": "たべる", "type": "reading_hira"}]}, {"cfrom": 11, "cto": 12, "text": "。", "lemma": "。", "pos": "記号", "tags": [{"value": "句点", "type": "sc1"}, {"value": "記号-句点", "type": "pos3"}, {"value": "。", "type": "reading_hira"}]}], "ID": "3"}
31 changes: 22 additions & 9 deletions test/test_tig.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

from chirptext import chio
from chirptext import deko
from chirptext import dekomecab

from speach import ttl
from speach import ttlig
Expand All @@ -36,9 +35,10 @@
TRANSCRIPT_FILE = os.path.join(TEST_DIR, 'data', 'test_transcript.tab')
TRANSCRIPT_EXPECTED_FILE = os.path.join(TEST_DIR, 'data', 'test_transcript.human.tab')

_MECAB_VERSION = None
_CAN_PARSE_JP = None
try:
_MECAB_VERSION = dekomecab.version()
engines = deko.engines()
_CAN_PARSE_JP = len(engines)
except Exception:
pass

Expand All @@ -62,9 +62,9 @@ def test_tokenizer(self):
glosses = ttlig.tokenize(gloss_string)
sent.tokens = tokens
for tk, gl in zip(sent.tokens, glosses):
tk.new_tag(gl, tagtype='gloss')
tk.tag.gloss = gl
# verify imported information
actual = [(t.text, t.get_tag('gloss').label) for t in sent]
actual = [(t.text, t.tag.gloss.value) for t in sent]
expected = [('It', 'SUBJ'), ('works', 'work'), ('.', 'PUNC')]
self.assertEqual(expected, actual)

Expand Down Expand Up @@ -220,18 +220,18 @@ def test_read_invalid_ttlig(self):
invalid_file = os.path.join(TEST_DIR, 'data', 'testig_invalid.txt')
self.assertRaises(Exception, lambda: ttlig.read(invalid_file))

@unittest.skipIf(not _MECAB_VERSION, "Deko is not available, test_make_furi_token is skipped.")
@unittest.skipIf(not _CAN_PARSE_JP, "Deko is not available, test_make_furi_token is skipped.")
def test_make_furi_token(self):
s = deko.parse('友達')
# f = ttlig.mctoken_to_furi(s[0])
f = ttlig.RubyToken.from_furi(s[0].surface, s[0].reading_hira())
f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
self.assertEqual(f.to_code(), '{友達/ともだち}')
# half-width char
s = deko.parse('0')
f = ttlig.RubyToken.from_furi(s[0].surface, s[0].reading_hira())
f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
self.assertEqual(f.to_code(), '0')

@unittest.skipIf(not _MECAB_VERSION, "Deko is not available, test_tig_parsing is skipped.")
@unittest.skipIf(not _CAN_PARSE_JP, "Deko is not available, test_tig_parsing is skipped.")
def test_tig_parsing(self):
igrow = ttlig.text_to_igrow('友達と巡り会った。')
self.assertEqual(igrow.text, '友達と巡り会った。')
Expand All @@ -251,6 +251,19 @@ def test_tig_parsing(self):
igrow = ttlig.text_to_igrow('0時だ。')
self.assertEqual(igrow.text, '0時だ。')
self.assertEqual(igrow.tokens, '0 {時/じ} だ 。')
# test export to ttl
ttl_sent = igrow.to_ttl()
expected = {'tags': [{'type': 'pos', 'value': '名詞-数 名詞-接尾-助数詞 助動詞 記号-句点'}],
'text': '0時だ。',
'tokens': [{'cfrom': 0, 'cto': 1, 'pos': '名詞-数', 'text': '0'},
{'cfrom': 1,
'cto': 2,
'pos': '名詞-接尾-助数詞',
'tags': [{'type': 'furi', 'value': '{時/じ}'}],
'text': '時'},
{'cfrom': 2, 'cto': 3, 'pos': '助動詞', 'text': 'だ'},
{'cfrom': 3, 'cto': 4, 'pos': '記号-句点', 'text': '。'}]}
self.assertEqual(expected, ttl_sent.to_dict())

def test_parsing_aligned_text(self):
print("Testing TTLIG with multiple spaces")
Expand Down
Loading

0 comments on commit 6e4d6c6

Please sign in to comment.