Skip to content

Commit

Permalink
support arbitrary wildcard (quickwit-oss#5606)
Browse files Browse the repository at this point in the history
* run wildcard as automatons

* add tests for new wildcard queries

* refactor json path handling for automaton queries

* regex support

* run automaton in search thread pool

* cleanup, refactor and test

* improve error messages

Co-authored-by: Adrien Guillo <[email protected]>

---------

Co-authored-by: Adrien Guillo <[email protected]>
  • Loading branch information
trinity-1686a and guilload authored Jan 10, 2025
1 parent b3dd76a commit 385c5b5
Show file tree
Hide file tree
Showing 16 changed files with 801 additions and 158 deletions.
36 changes: 21 additions & 15 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -328,12 +328,13 @@ quickwit-serve = { path = "quickwit-serve" }
quickwit-storage = { path = "quickwit-storage" }
quickwit-telemetry = { path = "quickwit-telemetry" }

tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "71cf198", default-features = false, features = [
tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "d281ca3", default-features = false, features = [
"lz4-compression",
"mmap",
"quickwit",
"zstd-compression",
] }
tantivy-fst = "0.5"

# This is not actually used directly; the goal is to pin the version
# used by reqwest.
Expand Down
59 changes: 59 additions & 0 deletions quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ pub struct TermRange {
pub limit: Option<u64>,
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
/// Supported automaton types to warm up.
pub enum Automaton {
/// A regex, kept in its string representation because `tantivy_fst::Regex` isn't `PartialEq`.
///
/// The first element is the path, as raw bytes, when the regex applies inside a JSON field
/// (`None` otherwise); the second element is the regex pattern itself.
Regex(Option<Vec<u8>>, String),
// A term-set query variant could be added here, instead of downloading the whole dictionary.
}

/// Description of how a fast field should be warmed up
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FastFieldWarmupInfo {
Expand All @@ -109,6 +118,8 @@ pub struct WarmupInfo {
pub terms_grouped_by_field: HashMap<Field, HashMap<Term, bool>>,
/// Term ranges to warmup, and whether their position is needed too.
pub term_ranges_grouped_by_field: HashMap<Field, HashMap<TermRange, bool>>,
/// Automatons to warmup
pub automatons_grouped_by_field: HashMap<Field, HashSet<Automaton>>,
}

impl WarmupInfo {
Expand Down Expand Up @@ -143,6 +154,11 @@ impl WarmupInfo {
*sub_map.entry(term_range).or_default() |= include_position;
}
}

for (field, automatons) in other.automatons_grouped_by_field.into_iter() {
let sub_map = self.automatons_grouped_by_field.entry(field).or_default();
sub_map.extend(automatons);
}
}

/// Simplify a WarmupInfo, removing some redundant tasks
Expand Down Expand Up @@ -599,6 +615,13 @@ mod tests {
.collect()
}

/// Test helper: turns a list of regex patterns into a set of `Automaton::Regex` values that
/// carry no JSON path.
fn automaton_hashset(elements: &[&str]) -> HashSet<Automaton> {
    let mut automatons = HashSet::with_capacity(elements.len());
    for pattern in elements {
        // Duplicated patterns collapse into a single entry, as with `collect()`.
        automatons.insert(Automaton::Regex(None, (*pattern).to_owned()));
    }
    automatons
}

fn hashset_field(elements: &[u32]) -> HashSet<Field> {
elements
.iter()
Expand Down Expand Up @@ -648,6 +671,12 @@ mod tests {
(2, "term1", false),
(2, "term2", false),
]),
automatons_grouped_by_field: [(
Field::from_field_id(1),
automaton_hashset(&["my_reg.*ex"]),
)]
.into_iter()
.collect(),
};

// merging with default has no impact
Expand All @@ -665,6 +694,12 @@ mod tests {
(3, "term1", false),
(2, "term2", true),
]),
automatons_grouped_by_field: [
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(2), automaton_hashset(&["my_reg.*ex"])),
]
.into_iter()
.collect(),
};
wi_base.merge(wi_2.clone());

Expand Down Expand Up @@ -712,6 +747,17 @@ mod tests {
);
}

let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")];
for (field, regex) in expected_automatons {
let field = Field::from_field_id(field);
let automaton = Automaton::Regex(None, regex.to_string());
assert!(wi_base
.automatons_grouped_by_field
.get(&field)
.unwrap()
.contains(&automaton));
}

// merge is idempotent
let mut wi_cloned = wi_base.clone();
wi_cloned.merge(wi_2);
Expand All @@ -734,6 +780,13 @@ mod tests {
(1, "term2", true),
(2, "term3", false),
]),
automatons_grouped_by_field: [
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
]
.into_iter()
.collect(),
};
let expected = WarmupInfo {
term_dict_fields: hashset_field(&[1]),
Expand All @@ -744,6 +797,12 @@ mod tests {
(1, "term2", true),
(2, "term3", false),
]),
automatons_grouped_by_field: [
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
]
.into_iter()
.collect(),
};

warmup_info.simplify();
Expand Down
2 changes: 1 addition & 1 deletion quickwit/quickwit-doc-mapper/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ mod routing_expression;
pub mod tag_pruning;

pub use doc_mapper::{
analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo,
analyze_text, Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo,
FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions,
QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo,
};
Expand Down
Loading

0 comments on commit 385c5b5

Please sign in to comment.