Skip to content

Commit 69e85ff

Browse files
committed
Optimize time ranged search queries
When the search request contains a time range, we aborted the optimization of converting unneeded split searches into count queries.
1 parent de696d4 commit 69e85ff

File tree

1 file changed

+67
-23
lines changed
  • quickwit/quickwit-search/src

1 file changed

+67
-23
lines changed

quickwit/quickwit-search/src/leaf.rs

+67-23
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// limitations under the License.
1414

1515
use std::collections::{HashMap, HashSet};
16-
use std::ops::Bound;
16+
use std::ops::{Bound, ControlFlow};
1717
use std::path::PathBuf;
1818
use std::str::FromStr;
1919
use std::sync::{Arc, Mutex, RwLock};
@@ -942,11 +942,6 @@ fn is_simple_all_query(search_request: &SearchRequest) -> bool {
942942
return false;
943943
}
944944

945-
// TODO: Update the logic to handle start_timestamp end_timestamp ranges
946-
if search_request.start_timestamp.is_some() || search_request.end_timestamp.is_some() {
947-
return false;
948-
}
949-
950945
let Ok(query_ast) = serde_json::from_str(&search_request.query_ast) else {
951946
return false;
952947
};
@@ -1000,6 +995,26 @@ impl CanSplitDoBetter {
1000995
}
1001996
}
1002997

998+
fn is_contained(split: &SplitIdAndFooterOffsets, search_request: &SearchRequest) -> bool {
999+
if let Some(start) = search_request.start_timestamp {
1000+
let Some(split_start) = split.timestamp_start else {
1001+
return false;
1002+
};
1003+
if split_start < start {
1004+
return false;
1005+
}
1006+
}
1007+
if let Some(end) = search_request.end_timestamp {
1008+
let Some(split_end) = split.timestamp_end else {
1009+
return false;
1010+
};
1011+
if split_end >= end {
1012+
return false;
1013+
}
1014+
}
1015+
true
1016+
}
1017+
10031018
/// Optimize the order in which splits will get processed based on how it can skip the most
10041019
/// splits.
10051020
///
@@ -1009,18 +1024,29 @@ impl CanSplitDoBetter {
10091024
/// are the most likely to fill our Top K.
10101025
/// In the future, as split get more metadata per column, we may be able to do this more than
10111026
/// just for timestamp and "unsorted" request.
1012-
fn optimize_split_order(&self, splits: &mut [SplitIdAndFooterOffsets]) {
1027+
///
1028+
/// To skip splits in time ranged queries, we sort the splits first by whether they are
1029+
/// contained in the search request time range.
1030+
fn optimize_split_order(
1031+
&self,
1032+
splits: &mut [SplitIdAndFooterOffsets],
1033+
search_request: &SearchRequest,
1034+
) {
10131035
match self {
10141036
CanSplitDoBetter::SplitIdHigher(_) => {
10151037
splits.sort_unstable_by(|a, b| b.split_id.cmp(&a.split_id))
10161038
}
10171039
CanSplitDoBetter::SplitTimestampHigher(_)
10181040
| CanSplitDoBetter::FindTraceIdsAggregation(_) => {
1019-
splits.sort_unstable_by_key(|split| std::cmp::Reverse(split.timestamp_end()))
1020-
}
1021-
CanSplitDoBetter::SplitTimestampLower(_) => {
1022-
splits.sort_unstable_by_key(|split| split.timestamp_start())
1041+
splits.sort_unstable_by_key(|split| {
1042+
let contained = Self::is_contained(split, search_request);
1043+
(!contained, std::cmp::Reverse(split.timestamp_end()))
1044+
})
10231045
}
1046+
CanSplitDoBetter::SplitTimestampLower(_) => splits.sort_unstable_by_key(|split| {
1047+
let contained = Self::is_contained(split, search_request);
1048+
(!contained, split.timestamp_start())
1049+
}),
10241050
CanSplitDoBetter::Uninformative => (),
10251051
}
10261052
}
@@ -1034,7 +1060,7 @@ impl CanSplitDoBetter {
10341060
request: Arc<SearchRequest>,
10351061
mut splits: Vec<SplitIdAndFooterOffsets>,
10361062
) -> Result<Vec<(SplitIdAndFooterOffsets, SearchRequest)>, SearchError> {
1037-
self.optimize_split_order(&mut splits);
1063+
self.optimize_split_order(&mut splits, &request);
10381064

10391065
if !is_simple_all_query(&request) {
10401066
// no optimization opportunity here.
@@ -1047,17 +1073,35 @@ impl CanSplitDoBetter {
10471073
let num_requested_docs = request.start_offset + request.max_hits;
10481074

10491075
// Calculate the number of splits which are guaranteed to deliver enough documents.
1050-
let min_required_splits = splits
1051-
.iter()
1052-
.map(|split| split.num_docs)
1053-
// computing the partial sum
1054-
.scan(0u64, |partial_sum: &mut u64, num_docs_in_split: u64| {
1055-
*partial_sum += num_docs_in_split;
1056-
Some(*partial_sum)
1057-
})
1058-
.take_while(|partial_sum| *partial_sum < num_requested_docs)
1059-
.count()
1060-
+ 1;
1076+
let min_required_splits = {
1077+
let mut partial_sum = 0u64;
1078+
1079+
let control_flow = splits
1080+
.iter()
1081+
// splits are sorted by whether they are contained in the request time range
1082+
.filter(|split| Self::is_contained(split, &request))
1083+
.map(|split| split.num_docs)
1084+
.try_fold(0usize, |count, num_docs_in_split| {
1085+
partial_sum += num_docs_in_split;
1086+
1087+
if partial_sum >= num_requested_docs {
1088+
ControlFlow::Break(count + 1)
1089+
} else {
1090+
ControlFlow::Continue(count + 1)
1091+
}
1092+
});
1093+
1094+
match control_flow {
1095+
ControlFlow::Break(required) => required,
1096+
ControlFlow::Continue(_) => {
1097+
// didn't reach num_requested_docs, nothing to optimize.
1098+
return Ok(splits
1099+
.into_iter()
1100+
.map(|split| (split, (*request).clone()))
1101+
.collect::<Vec<_>>());
1102+
}
1103+
}
1104+
};
10611105

10621106
// TODO: we maybe want here some deduplication + Cow logic
10631107
let mut split_with_req = splits

0 commit comments

Comments
 (0)