support arbitrary wildcard (quickwit-oss#5606)

* run wildcard as automatons * add tests for new wildcard queries * refactor json path handling for automaton queries * regex support * run automaton in search thread pool * cleanup, refactor and test * improve error messages Co-authored-by: Adrien Guillo <[email protected]> --------- Co-authored-by: Adrien Guillo <[email protected]>
devtron-labs · Jan 10, 2025 · 385c5b5 · 385c5b5
1 parent b3dd76a
commit 385c5b5
Show file tree

Hide file tree

Showing 16 changed files with 801 additions and 158 deletions.
diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml
@@ -328,12 +328,13 @@ quickwit-serve = { path = "quickwit-serve" }
 quickwit-storage = { path = "quickwit-storage" }
 quickwit-telemetry = { path = "quickwit-telemetry" }
 
-tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "71cf198", default-features = false, features = [
+tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "d281ca3", default-features = false, features = [
   "lz4-compression",
   "mmap",
   "quickwit",
   "zstd-compression",
 ] }
+tantivy-fst = "0.5"
 
 # This is actually not used directly the goal is to fix the version
 # used by reqwest.

diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
@@ -85,6 +85,15 @@ pub struct TermRange {
     pub limit: Option<u64>,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+/// Supported automaton types to warmup
+pub enum Automaton {
+    /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if
+    /// inside a json field
+    Regex(Option<Vec<u8>>, String),
+    // we could add termset query here, instead of downloading the whole dictionary
+}
+
 /// Description of how a fast field should be warmed up
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct FastFieldWarmupInfo {
@@ -109,6 +118,8 @@ pub struct WarmupInfo {
     pub terms_grouped_by_field: HashMap<Field, HashMap<Term, bool>>,
     /// Term ranges to warmup, and whether their position is needed too.
     pub term_ranges_grouped_by_field: HashMap<Field, HashMap<TermRange, bool>>,
+    /// Automatons to warmup
+    pub automatons_grouped_by_field: HashMap<Field, HashSet<Automaton>>,
 }
 
 impl WarmupInfo {
@@ -143,6 +154,11 @@ impl WarmupInfo {
                 *sub_map.entry(term_range).or_default() |= include_position;
             }
         }
+
+        for (field, automatons) in other.automatons_grouped_by_field.into_iter() {
+            let sub_map = self.automatons_grouped_by_field.entry(field).or_default();
+            sub_map.extend(automatons);
+        }
     }
 
     /// Simplify a WarmupInfo, removing some redundant tasks
@@ -599,6 +615,13 @@ mod tests {
             .collect()
     }
 
+    fn automaton_hashset(elements: &[&str]) -> HashSet<Automaton> {
+        elements
+            .iter()
+            .map(|elem| Automaton::Regex(None, elem.to_string()))
+            .collect()
+    }
+
     fn hashset_field(elements: &[u32]) -> HashSet<Field> {
         elements
             .iter()
@@ -648,6 +671,12 @@ mod tests {
                 (2, "term1", false),
                 (2, "term2", false),
             ]),
+            automatons_grouped_by_field: [(
+                Field::from_field_id(1),
+                automaton_hashset(&["my_reg.*ex"]),
+            )]
+            .into_iter()
+            .collect(),
         };
 
         // merging with default has no impact
@@ -665,6 +694,12 @@ mod tests {
                 (3, "term1", false),
                 (2, "term2", true),
             ]),
+            automatons_grouped_by_field: [
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(2), automaton_hashset(&["my_reg.*ex"])),
+            ]
+            .into_iter()
+            .collect(),
         };
         wi_base.merge(wi_2.clone());
 
@@ -712,6 +747,17 @@ mod tests {
             );
         }
 
+        let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")];
+        for (field, regex) in expected_automatons {
+            let field = Field::from_field_id(field);
+            let automaton = Automaton::Regex(None, regex.to_string());
+            assert!(wi_base
+                .automatons_grouped_by_field
+                .get(&field)
+                .unwrap()
+                .contains(&automaton));
+        }
+
         // merge is idempotent
         let mut wi_cloned = wi_base.clone();
         wi_cloned.merge(wi_2);
@@ -734,6 +780,13 @@ mod tests {
                 (1, "term2", true),
                 (2, "term3", false),
             ]),
+            automatons_grouped_by_field: [
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
+            ]
+            .into_iter()
+            .collect(),
         };
         let expected = WarmupInfo {
             term_dict_fields: hashset_field(&[1]),
@@ -744,6 +797,12 @@ mod tests {
                 (1, "term2", true),
                 (2, "term3", false),
             ]),
+            automatons_grouped_by_field: [
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
+            ]
+            .into_iter()
+            .collect(),
         };
 
         warmup_info.simplify();

diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs
@@ -35,7 +35,7 @@ mod routing_expression;
 pub mod tag_pruning;
 
 pub use doc_mapper::{
-    analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo,
+    analyze_text, Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo,
     FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions,
     QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo,
 };