Skip to content

Commit

Permalink
feat(dict): implement trie dictionary dumping
Browse files Browse the repository at this point in the history
  • Loading branch information
kanru committed Mar 10, 2024
1 parent 8fc1634 commit c06807d
Show file tree
Hide file tree
Showing 13 changed files with 243 additions and 45 deletions.
8 changes: 1 addition & 7 deletions src/capi/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -730,13 +730,7 @@ pub extern "C" fn chewing_userphrase_enumerate(ctx: *mut ChewingContext) -> c_in
None => return -1,
};

ctx.userphrase_iter = Some(
ctx.editor
.user_dict()
.entries()
.unwrap_or(Box::new(iter::empty()))
.peekable(),
);
ctx.userphrase_iter = Some(ctx.editor.user_dict().entries().peekable());
0
}

Expand Down
2 changes: 1 addition & 1 deletion src/capi/public.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ pub struct ChewingContext {
pub(crate) kbcompat_iter: Option<Peekable<Box<dyn Iterator<Item = KeyboardLayoutCompat>>>>,
pub(crate) cand_iter: Option<Peekable<Box<dyn Iterator<Item = String>>>>,
pub(crate) interval_iter: Option<Peekable<Box<dyn Iterator<Item = Interval>>>>,
pub(crate) userphrase_iter: Option<Peekable<DictEntries>>,
pub(crate) userphrase_iter: Option<Peekable<DictEntries<'static>>>,
pub(crate) sel_keys: SelKeys,
}

Expand Down
6 changes: 3 additions & 3 deletions src/dictionary/cdb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ impl Dictionary for CdbDictionary {
self.inner.lookup_first_n_phrases(syllables, first)
}

fn entries(&self) -> Option<DictEntries> {
fn entries(&self) -> DictEntries<'_> {
self.inner.entries()
}

Expand All @@ -139,7 +139,7 @@ impl Dictionary for CdbDictionary {
}
let mut writer = CDBWriter::create(&self.path).map_err(Error::from)?;
writer.add(b"INFO", &[]).map_err(Error::from)?;
for entry in self.entries().unwrap() {
for entry in self.entries() {
let mut data_buf = vec![];
write_phrase(&mut data_buf, &entry.1).map_err(Error::from)?;
writer
Expand Down Expand Up @@ -302,7 +302,7 @@ mod tests {
vec![syl![Z, TONE4], syl![D, I, AN, TONE3]],
Phrase::from(("dict", 1, 2))
)],
dict.entries().unwrap().collect::<Vec<_>>()
dict.entries().collect::<Vec<_>>()
);
Ok(())
}
Expand Down
6 changes: 3 additions & 3 deletions src/dictionary/kv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,13 +206,13 @@ where
phrases
}

pub(crate) fn entries(&self) -> Option<DictEntries> {
Some(Box::new(
pub(crate) fn entries(&self) -> DictEntries<'_> {
Box::new(
self.entries_iter()
.map(|it| (phrase_from_bytes(&it.0), it.1))
.collect::<Vec<_>>()
.into_iter(),
))
)
}

pub(crate) fn add_phrase(
Expand Down
5 changes: 3 additions & 2 deletions src/dictionary/layered.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,9 @@ impl Dictionary for LayeredDictionary {
phrases
}

fn entries(&self) -> Option<DictEntries> {
None
fn entries(&self) -> DictEntries<'_> {
// FIXME
Box::new(iter::empty())

Check warning on line 113 in src/dictionary/layered.rs

View check run for this annotation

Codecov / codecov/patch

src/dictionary/layered.rs#L111-L113

Added lines #L111 - L113 were not covered by tests
}

fn about(&self) -> DictionaryInfo {
Expand Down
5 changes: 1 addition & 4 deletions src/dictionary/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,7 @@ impl UserDictionaryLoader {
if data_path != cdb_path && cdb_path.exists() {
let cdb_dict = CdbDictionary::open(cdb_path)
.map_err(|e| io::Error::new(io::ErrorKind::Other, Box::new(e)))?;
for (syllables, phrase) in cdb_dict
.entries()
.expect("CDB dictionary should support entries()")
{
for (syllables, phrase) in cdb_dict.entries() {
fresh_dict
.add_phrase(&syllables, phrase)
.map_err(|e| io::Error::new(io::ErrorKind::Other, Box::new(e)))?;
Expand Down
16 changes: 8 additions & 8 deletions src/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ impl Display for Phrase {
pub type Phrases<'a> = Box<dyn Iterator<Item = Phrase> + 'a>;

/// A boxed iterator over the `(syllables, phrase)` entries of a dictionary.
pub type DictEntries = Box<dyn Iterator<Item = (Vec<Syllable>, Phrase)>>;
pub type DictEntries<'a> = Box<dyn Iterator<Item = (Vec<Syllable>, Phrase)> + 'a>;

/// An interface for looking up dictionaries.
///
Expand Down Expand Up @@ -308,9 +308,7 @@ pub trait Dictionary: Any + Debug {
self.lookup_first_n_phrases(syllables, usize::MAX)
}
/// Returns an iterator to all phrases in the dictionary.
///
/// Some dictionary backend does not support this operation.
fn entries(&self) -> Option<DictEntries>;
fn entries(&self) -> DictEntries<'_>;
/// Returns information about the dictionary instance.
fn about(&self) -> DictionaryInfo;
/// Reopens the dictionary if it was changed by a different process
Expand Down Expand Up @@ -406,10 +404,12 @@ impl Dictionary for HashMap<Vec<Syllable>, Vec<Phrase>> {
phrases
}

fn entries(&self) -> Option<DictEntries> {
Some(Box::new(self.clone().into_iter().flat_map(|(k, v)| {
v.into_iter().map(move |phrase| (k.clone(), phrase.clone()))
})))
/// Returns an iterator over every `(syllables, phrase)` pair stored in the map.
///
/// The iterator borrows the map and clones one key and one phrase per item
/// as it is advanced, instead of cloning the whole map up front.
fn entries(&self) -> DictEntries<'_> {
    Box::new(self.iter().flat_map(|(syllables, phrases)| {
        phrases
            .iter()
            // `syllables` is a shared reference, so `move` only copies the reference.
            .map(move |phrase| (syllables.clone(), phrase.clone()))
    }))
}

fn about(&self) -> DictionaryInfo {
Expand Down
6 changes: 3 additions & 3 deletions src/dictionary/sqlite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,15 +298,15 @@ impl Dictionary for SqliteDictionary {
}

// FIXME: too many clones
fn entries(&self) -> Option<DictEntries> {
fn entries(&self) -> DictEntries<'_> {
let mut stmt = self
.conn
.prepare_cached(
"SELECT syllables, phrase, max(freq, coalesce(user_freq, 0)), time
FROM dictionary_v1 LEFT JOIN userphrase_v2 ON userphrase_id = id",
)
.expect("SQL error");
Some(Box::new(
Box::new(
stmt.query_map([], |row| {
let (syllables_bytes, phrase, freq, time): (Vec<u8>, String, _, _) =
row.try_into()?;
Expand All @@ -329,7 +329,7 @@ impl Dictionary for SqliteDictionary {
.map(|r| r.unwrap())
.collect::<Vec<_>>()
.into_iter(),
))
)
}

fn about(&self) -> DictionaryInfo {
Expand Down
146 changes: 141 additions & 5 deletions src/dictionary/trie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::{
fmt::Debug,
fs::File,
io::{self, BufWriter, Read, Seek, Write},
mem,
iter, mem,
num::NonZeroUsize,
path::Path,
str,
Expand Down Expand Up @@ -36,7 +36,7 @@ const ILIC: ChunkId = ChunkId { value: *b"ILIC" };
const IREV: ChunkId = ChunkId { value: *b"IREV" };
const ISFT: ChunkId = ChunkId { value: *b"ISFT" };

#[derive(Pod, Zeroable, Copy, Clone)]
#[derive(Pod, Zeroable, Copy, Clone, Debug)]

Check warning on line 39 in src/dictionary/trie.rs

View check run for this annotation

Codecov / codecov/patch

src/dictionary/trie.rs#L39

Added line #L39 was not covered by tests
#[repr(C)]
struct TrieNodePod {
child_begin_raw: u32,
Expand All @@ -58,7 +58,7 @@ impl TrieNodePod {
}
}

#[derive(Pod, Zeroable, Copy, Clone)]
#[derive(Pod, Zeroable, Copy, Clone, Debug)]

Check warning on line 61 in src/dictionary/trie.rs

View check run for this annotation

Codecov / codecov/patch

src/dictionary/trie.rs#L61

Added line #L61 was not covered by tests
#[repr(C)]
struct TrieLeafPod {
data_begin_raw: u32,
Expand Down Expand Up @@ -353,8 +353,83 @@ impl Dictionary for TrieDictionary {
.collect()
}

fn entries(&self) -> Option<DictEntries> {
None
fn entries(&self) -> DictEntries<'_> {
let mut results = Vec::new();
let mut stack = Vec::new();
let mut syllables = Vec::new();
let root: &TrieNodePod = from_bytes(&self.dict[..TrieNodePod::SIZE]);
let mut node = root;
let mut done = false;
let make_dict_entry =
|syllables: &[u16], node: &TrieNodePod| -> (Vec<Syllable>, Vec<Phrase>) {
let leaf_data = &self.dict[node.child_begin()..];
let leaf: &TrieLeafPod = from_bytes(&leaf_data[..TrieLeafPod::SIZE]);
debug_assert_eq!(leaf.reserved_zero(), 0);
let phrases = PhrasesIter {
bytes: &self.data[leaf.data_begin()..leaf.data_end()],
}
.collect::<Vec<_>>();
(
syllables
.iter()
// FIXME - skip invalid entry?
.map(|&syl_u16| Syllable::try_from(syl_u16).unwrap())
.collect::<Vec<_>>(),
phrases,
)
};
let it = iter::from_fn(move || {
if !results.is_empty() {
return results.pop();
}
if done {
return None;
}
// descend until we find a leaf node which is not also an internal node.
loop {
let child_nodes: &[TrieNodePod] =
cast_slice(&self.dict[node.child_begin()..node.child_end()]);
let mut child_iter = child_nodes.into_iter();
let mut next = child_iter
.next()
.expect("syllable node should have at least one child node");
if next.syllable() == 0 {
// found a leaf syllable node
results.push(make_dict_entry(&syllables, node));
if let Some(second) = child_iter.next() {
next = second;
} else {
break;
}
}
node = next;
syllables.push(node.syllable());
stack.push(child_iter);
}
// ascend until we can go down again
loop {
if let Some(mut child_nodes) = stack.pop() {
syllables.pop();
if let Some(next) = child_nodes.next() {
debug_assert_ne!(next.syllable(), 0);
node = next;
stack.push(child_nodes);
syllables.push(node.syllable());
break;

Check warning on line 418 in src/dictionary/trie.rs

View check run for this annotation

Codecov / codecov/patch

src/dictionary/trie.rs#L414-L418

Added lines #L414 - L418 were not covered by tests
}
} else {
done = true;
break;
}
}
results.pop()
});
let entries = it.flat_map(|(syllables, phrases)| {
phrases
.into_iter()
.map(move |phrase| (syllables.clone(), phrase))
});
Box::new(entries)
}

fn about(&self) -> DictionaryInfo {
Expand Down Expand Up @@ -1374,4 +1449,65 @@ mod tests {
assert_eq!("version", info.version.unwrap());
assert_eq!("software", info.software.unwrap());
}

#[test]
fn tree_entries() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieDictionaryBuilder::new();
builder.insert(
&vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 1).into(),
)?;

Check warning on line 1462 in src/dictionary/trie.rs

View check run for this annotation

Codecov / codecov/patch

src/dictionary/trie.rs#L1462

Added line #L1462 was not covered by tests
builder.insert(
&vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("策試", 2).into(),
)?;

Check warning on line 1469 in src/dictionary/trie.rs

View check run for this annotation

Codecov / codecov/patch

src/dictionary/trie.rs#L1469

Added line #L1469 was not covered by tests
builder.insert(
&vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::CH, Bopomofo::ENG, Bopomofo::TONE2],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
("測試成功", 3).into(),
)?;

Check warning on line 1478 in src/dictionary/trie.rs

View check run for this annotation

Codecov / codecov/patch

src/dictionary/trie.rs#L1478

Added line #L1478 was not covered by tests
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;

let dict = TrieDictionary::new(&mut cursor)?;
assert_eq!(
vec![
(
vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::CH, Bopomofo::ENG, Bopomofo::TONE2],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
Phrase::new("測試成功", 3)
),
(
vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4]
],
Phrase::new("策試", 2)
),
(
vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4]
],
Phrase::new("測試", 1)
),
],
dict.entries().collect::<Vec<_>>()
);
Ok(())
}
}
15 changes: 6 additions & 9 deletions src/editor/estimate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,12 @@ pub struct LaxUserFreqEstimate {
impl LaxUserFreqEstimate {
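/// Creates an estimate whose lifetime is the largest `last_used` value found in
/// the entries of `user_dict`; missing values count as zero.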
pub fn open(user_dict: &dyn Dictionary) -> Result<LaxUserFreqEstimate, EstimateError> {
if let Some(entries) = user_dict.entries() {
let lifetime = entries
.map(|it| it.1.last_used().unwrap_or_default())
.max()
.unwrap_or_default();
Ok(LaxUserFreqEstimate { lifetime })
} else {
Ok(LaxUserFreqEstimate { lifetime: 0 })
}
let lifetime = user_dict
.entries()
.map(|it| it.1.last_used().unwrap_or_default())
.max()
.unwrap_or_default();
Ok(LaxUserFreqEstimate { lifetime })
}

pub fn open_in_memory(initial_lifetime: u64) -> LaxUserFreqEstimate {
Expand Down
Loading

0 comments on commit c06807d

Please sign in to comment.