Skip to content

Commit

Permalink
Merge pull request #27 from letuananh/main
Browse files Browse the repository at this point in the history
speach 0.1a9 with TTLv2 API
  • Loading branch information
letuananh authored May 27, 2021
2 parents 703168e + 7f07e1e commit 6e4d6c6
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 55 deletions.
7 changes: 7 additions & 0 deletions docs/updates.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
Speach Changelog
================

Speach 0.1a9
------------

- 2021-05-27

- Use TTLv2 API (chirptext >= 0.2a4.post1)

Speach 0.1a8
------------

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chirptext >= 0.1, <0.3
chirptext >= 0.2a4.post1, <0.3
puchikarui >= 0.1, <0.3
4 changes: 2 additions & 2 deletions speach/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
__issue__ = "https://github.com/neocl/speach/issues/"
__maintainer__ = "Le Tuan Anh"
__version_major__ = "0.1" # follow PEP-0440
__version__ = "{}a8".format(__version_major__)
__version_long__ = "{} - Alpha 8".format(__version_major__)
__version__ = "{}a9".format(__version_major__)
__version_long__ = "{} - Alpha 9".format(__version_major__)
__status__ = "3 - Alpha"
4 changes: 2 additions & 2 deletions speach/data/scripts/init_corpus.sql
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ CREATE TABLE IF NOT EXISTS "tag" (
,"wid" INTEGER
,"cfrom" INTEGER
,"cto" INTEGER
,"label" TEXT
,"value" TEXT
,"source" TEXT
,"tagtype" TEXT
,"type" TEXT
,FOREIGN KEY(sid) REFERENCES sentence(ID) ON DELETE CASCADE ON UPDATE CASCADE
);

Expand Down
18 changes: 9 additions & 9 deletions speach/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ def __init__(self, *args, **kwargs):
proto=ttl.Token).set_id('ID')
self.add_table('concept', ['ID', 'sid', 'cidx', 'clemma', 'tag', 'flag', 'comment'],
proto=ttl.Concept).set_id('ID')
self.add_table('tag', ['ID', 'sid', 'wid', 'cfrom', 'cto', 'label', 'source', 'tagtype'],
proto=ttl.Tag).set_id('ID')
self.add_table('tag', ['ID', 'sid', 'wid', 'cfrom', 'cto', 'value', 'source', 'type'], id_cols="ID")
self.add_table('cwl', ['sid', 'cid', 'wid'], proto=CWLink)

@with_ctx
Expand Down Expand Up @@ -169,23 +168,24 @@ def get_sent(self, sentID, ctx=None):
for tk in tokens:
sent.tokens.append(tk)
# select all tags
tags = ctx.tag.select('sid = ?', (sent.ID,))
tags = ctx.execute('SELECT * FROM TAG where sid = ?', (sent.ID,))
for tag in tags:
if tag.wid is None:
sent.tags.append(tag)
elif tag.wid in tokenmap:
tokenmap[tag.wid].tags.append(tag)
# TODO: Don't use _append internal
if tag['wid'] is None:
sent.tags.new(**tag)
elif tag['wid'] in tokenmap:
tokenmap[tag['wid']].tags.new(**tag)
else:
getLogger().warning("Orphan tag in sentence #{}: {}".format(sent.ID, tag))
# select concepts
concepts = ctx.concept.select('sid = ?', (sent.ID,))
conceptmap = {c.ID: c for c in concepts}
for c in concepts:
sent.add_concept(c)
sent.concepts._append(c)
# select cwl
cwlinks = ctx.cwl.select('sid = ?', (sent.ID,))
for cwl in cwlinks:
conceptmap[cwl.cid].add_token(tokenmap[cwl.wid])
conceptmap[cwl.cid].tokens += tokenmap[cwl.wid]
return sent

@with_ctx
Expand Down
44 changes: 25 additions & 19 deletions speach/ttlig.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from chirptext import DataObject, piter
from chirptext import chio
from chirptext.deko import is_kana, parse
from chirptext import deko
from chirptext import ttl


Expand Down Expand Up @@ -62,20 +62,20 @@ def to_ttl(self):
data = self.to_dict()
for l in TTLIG.KNOWN_LABELS:
if l not in ['text', 'orth', 'tokens'] and l in data and data[l]:
ttl_sent.new_tag(data[l], tagtype=l)
ttl_sent.tags.new(data[l], type=l)
if self.tokens:
_tokens = parse_ruby(self.tokens)
ttl_sent.tokens = (t.text() for t in _tokens)
for ttl_token, furi_token in zip(ttl_sent, _tokens):
if furi_token.surface != furi_token.text():
ttl_token.new_tag(furi_token.surface, tagtype='furi')
ttl_token.tags.new(furi_token.surface, type='furi')
if self.morphtrans:
_morphtokens = tokenize(self.morphtrans)
if len(_morphtokens) != len(ttl_sent):
logging.getLogger(__name__).warning("Morphophonemic transliteration line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _morphtokens):
t.new_tag(m, tagtype='mtrans')
t.tags.new(m, type='mtrans')
if self.pos:
_postokens = tokenize(self.pos)
if len(_postokens) != len(ttl_sent):
Expand All @@ -96,14 +96,14 @@ def to_ttl(self):
logging.getLogger(__name__).warning("morpheme-by-morpheme gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _glosstokens):
t.new_tag(m, tagtype='mgloss')
t.tags.new(m, type='mgloss')
if self.wordgloss:
_glosstokens = tokenize(self.wordgloss)
if len(_glosstokens) != len(ttl_sent):
logging.getLogger(__name__).warning("word-by-word gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _glosstokens):
t.new_tag(m, tagtype='wgloss')
t.tags.new(m, type='wgloss')
return ttl_sent

def to_expex(self, default_ident=''):
Expand Down Expand Up @@ -177,7 +177,7 @@ def tsduration(self):
return None
else:
return self.tsto - self.tsfrom

def overlap(self, other):
''' Calculate overlap score between this utterance and another
Score = 0 means adjacent, score > 0 means overlapped, score < 0 means no overlap (the distance between the two)
Expand Down Expand Up @@ -374,14 +374,14 @@ def to_anki(self):
else:
frags.append(str(g))
return ''.join(frags)

def __str__(self):
return self.text()

@staticmethod
def from_furi(surface, kana):
ruby = RubyToken(surface=surface)
if is_kana(surface):
if deko.is_kana(surface):
ruby.append(surface)
return ruby
edit_seq = ndiff(surface, kana)
Expand Down Expand Up @@ -533,18 +533,24 @@ def mctoken_to_furi(token):
return RubyToken.from_furi(token.surface, token.reading_hira())


def text_to_igrow(txt):
''' Parse text to TTLIG format '''
msent = parse(txt)
def ttl_to_igrow(msent):
    ''' Convert a parsed TTL sentence into a TTLIG row (IGRow).

    Uses the TTLv2 token API (chirptext >= 0.2a4.post1): ``pos3``,
    ``reading_hira`` and ``lemma`` are plain attributes, not methods.

    :param msent: a TTL sentence (e.g. the result of ``deko.parse()``)
    :return: an :class:`IGRow` with space-joined ``tokens``, ``pos`` and
             ``lemma`` lines and ``text`` taken from the sentence text
    '''
    tokens = []
    pos = []
    lemmas = []
    for token in msent:
        if token.is_eos:
            # skip end-of-sentence pseudo tokens emitted by the parser
            continue
        # prefer the fine-grained POS (pos3) and fall back to the coarse pos
        pos.append(token.pos3 if token.pos3 else token.pos)
        if token.reading_hira:
            # token has a hiragana reading -> encode as ruby, e.g. {時/じ}
            r = RubyToken.from_furi(token.text, token.reading_hira)
            tokens.append(r.to_code())
        else:
            tokens.append(token.text)
        lemmas.append(token.lemma if token.lemma else '')
    igrow = IGRow(text=msent.text, tokens=' '.join(tokens), pos=' '.join(pos), lemma=' '.join(lemmas))
    return igrow


def text_to_igrow(txt):
    ''' Parse a raw text string into a TTLIG row (IGRow).

    The text is parsed with deko (MeCab) first and the resulting TTL
    sentence is then converted via :func:`ttl_to_igrow`.
    '''
    return ttl_to_igrow(deko.parse(txt))
6 changes: 3 additions & 3 deletions test/data/test.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"text": "三毛猫が好きです。", "tokens": [{"cfrom": 0, "cto": 1, "text": "三", "pos": "名詞-数", "tags": {"pos": ["名詞-数"], "Reading": ["さん"]}}, {"cfrom": 1, "cto": 2, "text": "毛", "pos": "名詞-接尾-助数詞", "tags": {"pos": ["名詞-接尾-助数詞"], "Reading": ["もう"]}}, {"cfrom": 2, "cto": 3, "text": "猫", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["ねこ"]}}, {"cfrom": 3, "cto": 4, "text": "が", "pos": "助詞-格助詞-一般", "tags": {"pos": ["助詞-格助詞-一般"], "Reading": ["が"]}}, {"cfrom": 4, "cto": 6, "text": "好き", "pos": "名詞-形容動詞語幹", "tags": {"pos": ["名詞-形容動詞語幹"], "Reading": ["すき"]}}, {"cfrom": 6, "cto": 8, "text": "です", "pos": "助動詞", "tags": {"pos": ["助動詞"], "Reading": ["です"]}}, {"cfrom": 8, "cto": 9, "text": "。", "pos": "記号-句点", "tags": {"pos": ["記号-句点"], "Reading": ["。"]}}], "concepts": [{"clemma": "wiki.ja:三毛猫", "tag": "三毛猫", "tokens": [0, 1, 2], "comment": "Calico cat, you know?"}], "ID": 1}
{"text": "雨が降る。", "tokens": [{"cfrom": 0, "cto": 1, "text": "雨", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["あめ"]}}, {"cfrom": 1, "cto": 2, "text": "が", "pos": "助詞-格助詞-一般", "tags": {"pos": ["助詞-格助詞-一般"], "Reading": ["が"]}}, {"cfrom": 2, "cto": 4, "text": "降る", "pos": "動詞-自立", "tags": {"pos": ["動詞-自立"], "Reading": ["ふる"]}}, {"cfrom": 4, "cto": 5, "text": "。", "pos": "記号-句点", "tags": {"pos": ["記号-句点"], "Reading": ["。"]}}], "concepts": [{"clemma": "02756821-v", "tag": "降る", "tokens": [2]}], "ID": 2}
{"text": "女の子はケーキを食べる。", "tokens": [{"cfrom": 0, "cto": 3, "text": "女の子", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["おんなのこ"]}}, {"cfrom": 3, "cto": 4, "text": "は", "pos": "助詞-係助詞", "tags": {"pos": ["助詞-係助詞"], "Reading": ["は"]}}, {"cfrom": 4, "cto": 7, "text": "ケーキ", "pos": "名詞-一般", "tags": {"pos": ["名詞-一般"], "Reading": ["けーき"]}}, {"cfrom": 7, "cto": 8, "text": "を", "pos": "助詞-格助詞-一般", "tags": {"pos": ["助詞-格助詞-一般"], "Reading": ["を"]}}, {"cfrom": 8, "cto": 11, "text": "食べる", "pos": "動詞-自立", "comment": "to eat", "tags": {"pos": ["動詞-自立"], "Reading": ["たべる"]}}, {"cfrom": 11, "cto": 12, "text": "。", "pos": "記号-句点", "tags": {"pos": ["記号-句点"], "Reading": ["。"]}}], "concepts": [{"clemma": "10084295-n", "tag": "女の子", "tokens": [0]}, {"clemma": "01166351-v", "tag": "食べる", "tokens": [4]}], "ID": 3}
{"text": "三毛猫が好きです。", "tokens": [{"cfrom": 0, "cto": 1, "text": "三", "lemma": "三", "pos": "名詞", "tags": [{"value": "数", "type": "sc1"}, {"value": "名詞-数", "type": "pos3"}, {"value": "さん", "type": "reading_hira"}]}, {"cfrom": 1, "cto": 2, "text": "毛", "lemma": "毛", "pos": "名詞", "tags": [{"value": "接尾", "type": "sc1"}, {"value": "助数詞", "type": "sc2"}, {"value": "名詞-接尾-助数詞", "type": "pos3"}, {"value": "もう", "type": "reading_hira"}]}, {"cfrom": 2, "cto": 3, "text": "猫", "lemma": "猫", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "ねこ", "type": "reading_hira"}]}, {"cfrom": 3, "cto": 4, "text": "が", "lemma": "が", "pos": "助詞", "tags": [{"value": "格助詞", "type": "sc1"}, {"value": "一般", "type": "sc2"}, {"value": "助詞-格助詞-一般", "type": "pos3"}, {"value": "が", "type": "reading_hira"}]}, {"cfrom": 4, "cto": 6, "text": "好き", "lemma": "好き", "pos": "名詞", "tags": [{"value": "形容動詞語幹", "type": "sc1"}, {"value": "名詞-形容動詞語幹", "type": "pos3"}, {"value": "すき", "type": "reading_hira"}]}, {"cfrom": 6, "cto": 8, "text": "です", "lemma": "です", "pos": "助動詞", "tags": [{"value": "特殊・デス", "type": "inf"}, {"value": "基本形", "type": "conj"}, {"value": "助動詞", "type": "pos3"}, {"value": "です", "type": "reading_hira"}]}, {"cfrom": 8, "cto": 9, "text": "。", "lemma": "。", "pos": "記号", "tags": [{"value": "句点", "type": "sc1"}, {"value": "記号-句点", "type": "pos3"}, {"value": "。", "type": "reading_hira"}]}], "ID": "1"}
{"text": "雨が降る。", "tokens": [{"cfrom": 0, "cto": 1, "text": "雨", "lemma": "雨", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "あめ", "type": "reading_hira"}]}, {"cfrom": 1, "cto": 2, "text": "が", "lemma": "が", "pos": "助詞", "tags": [{"value": "格助詞", "type": "sc1"}, {"value": "一般", "type": "sc2"}, {"value": "助詞-格助詞-一般", "type": "pos3"}, {"value": "が", "type": "reading_hira"}]}, {"cfrom": 2, "cto": 4, "text": "降る", "lemma": "降る", "pos": "動詞", "tags": [{"value": "自立", "type": "sc1"}, {"value": "五段・ラ行", "type": "inf"}, {"value": "基本形", "type": "conj"}, {"value": "動詞-自立", "type": "pos3"}, {"value": "ふる", "type": "reading_hira"}]}, {"cfrom": 4, "cto": 5, "text": "。", "lemma": "。", "pos": "記号", "tags": [{"value": "句点", "type": "sc1"}, {"value": "記号-句点", "type": "pos3"}, {"value": "。", "type": "reading_hira"}]}], "ID": "2"}
{"text": "女の子はケーキを食べる。", "tokens": [{"cfrom": 0, "cto": 3, "text": "女の子", "lemma": "女の子", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "おんなのこ", "type": "reading_hira"}]}, {"cfrom": 3, "cto": 4, "text": "は", "lemma": "は", "pos": "助詞", "tags": [{"value": "係助詞", "type": "sc1"}, {"value": "助詞-係助詞", "type": "pos3"}, {"value": "は", "type": "reading_hira"}]}, {"cfrom": 4, "cto": 7, "text": "ケーキ", "lemma": "ケーキ", "pos": "名詞", "tags": [{"value": "一般", "type": "sc1"}, {"value": "名詞-一般", "type": "pos3"}, {"value": "けーき", "type": "reading_hira"}]}, {"cfrom": 7, "cto": 8, "text": "を", "lemma": "を", "pos": "助詞", "tags": [{"value": "格助詞", "type": "sc1"}, {"value": "一般", "type": "sc2"}, {"value": "助詞-格助詞-一般", "type": "pos3"}, {"value": "を", "type": "reading_hira"}]}, {"cfrom": 8, "cto": 11, "text": "食べる", "lemma": "食べる", "pos": "動詞", "tags": [{"value": "自立", "type": "sc1"}, {"value": "一段", "type": "inf"}, {"value": "基本形", "type": "conj"}, {"value": "動詞-自立", "type": "pos3"}, {"value": "たべる", "type": "reading_hira"}]}, {"cfrom": 11, "cto": 12, "text": "。", "lemma": "。", "pos": "記号", "tags": [{"value": "句点", "type": "sc1"}, {"value": "記号-句点", "type": "pos3"}, {"value": "。", "type": "reading_hira"}]}], "ID": "3"}
31 changes: 22 additions & 9 deletions test/test_tig.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

from chirptext import chio
from chirptext import deko
from chirptext import dekomecab

from speach import ttl
from speach import ttlig
Expand All @@ -36,9 +35,10 @@
TRANSCRIPT_FILE = os.path.join(TEST_DIR, 'data', 'test_transcript.tab')
TRANSCRIPT_EXPECTED_FILE = os.path.join(TEST_DIR, 'data', 'test_transcript.human.tab')

_MECAB_VERSION = None
_CAN_PARSE_JP = None
try:
_MECAB_VERSION = dekomecab.version()
engines = deko.engines()
_CAN_PARSE_JP = len(engines)
except Exception:
pass

Expand All @@ -62,9 +62,9 @@ def test_tokenizer(self):
glosses = ttlig.tokenize(gloss_string)
sent.tokens = tokens
for tk, gl in zip(sent.tokens, glosses):
tk.new_tag(gl, tagtype='gloss')
tk.tag.gloss = gl
# verify imported information
actual = [(t.text, t.get_tag('gloss').label) for t in sent]
actual = [(t.text, t.tag.gloss.value) for t in sent]
expected = [('It', 'SUBJ'), ('works', 'work'), ('.', 'PUNC')]
self.assertEqual(expected, actual)

Expand Down Expand Up @@ -220,18 +220,18 @@ def test_read_invalid_ttlig(self):
invalid_file = os.path.join(TEST_DIR, 'data', 'testig_invalid.txt')
self.assertRaises(Exception, lambda: ttlig.read(invalid_file))

@unittest.skipIf(not _MECAB_VERSION, "Deko is not available, test_make_furi_token is skipped.")
@unittest.skipIf(not _CAN_PARSE_JP, "Deko is not available, test_make_furi_token is skipped.")
def test_make_furi_token(self):
s = deko.parse('友達')
# f = ttlig.mctoken_to_furi(s[0])
f = ttlig.RubyToken.from_furi(s[0].surface, s[0].reading_hira())
f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
self.assertEqual(f.to_code(), '{友達/ともだち}')
# half-width char
s = deko.parse('0')
f = ttlig.RubyToken.from_furi(s[0].surface, s[0].reading_hira())
f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
self.assertEqual(f.to_code(), '0')

@unittest.skipIf(not _MECAB_VERSION, "Deko is not available, test_tig_parsing is skipped.")
@unittest.skipIf(not _CAN_PARSE_JP, "Deko is not available, test_tig_parsing is skipped.")
def test_tig_parsing(self):
igrow = ttlig.text_to_igrow('友達と巡り会った。')
self.assertEqual(igrow.text, '友達と巡り会った。')
Expand All @@ -251,6 +251,19 @@ def test_tig_parsing(self):
igrow = ttlig.text_to_igrow('0時だ。')
self.assertEqual(igrow.text, '0時だ。')
self.assertEqual(igrow.tokens, '0 {時/じ} だ 。')
# test export to ttl
ttl_sent = igrow.to_ttl()
expected = {'tags': [{'type': 'pos', 'value': '名詞-数 名詞-接尾-助数詞 助動詞 記号-句点'}],
'text': '0時だ。',
'tokens': [{'cfrom': 0, 'cto': 1, 'pos': '名詞-数', 'text': '0'},
{'cfrom': 1,
'cto': 2,
'pos': '名詞-接尾-助数詞',
'tags': [{'type': 'furi', 'value': '{時/じ}'}],
'text': '時'},
{'cfrom': 2, 'cto': 3, 'pos': '助動詞', 'text': 'だ'},
{'cfrom': 3, 'cto': 4, 'pos': '記号-句点', 'text': '。'}]}
self.assertEqual(expected, ttl_sent.to_dict())

def test_parsing_aligned_text(self):
print("Testing TTLIG with multiple spaces")
Expand Down
Loading

0 comments on commit 6e4d6c6

Please sign in to comment.