Skip to content

Commit c16bae3

Browse files
authored
perf: evolve promql execution engine (#5691)
* use the same sort option across every prom plan Signed-off-by: Ruihang Xia <[email protected]> * tweak plans Signed-off-by: Ruihang Xia <[email protected]> * wip Signed-off-by: Ruihang Xia <[email protected]> * fix merge compile Signed-off-by: Ruihang Xia <[email protected]> * Revert "wip" This reverts commit db58884. * tweak merge scan Signed-off-by: Ruihang Xia <[email protected]> * handle error Signed-off-by: Ruihang Xia <[email protected]> * pass distribution rule Signed-off-by: Ruihang Xia <[email protected]> * reverse sort order Signed-off-by: Ruihang Xia <[email protected]> * refine plans Signed-off-by: Ruihang Xia <[email protected]> * more optimizations for plans Signed-off-by: Ruihang Xia <[email protected]> * check logical table Signed-off-by: Ruihang Xia <[email protected]> * fix tests Signed-off-by: Ruihang Xia <[email protected]> * weird tests Signed-off-by: Ruihang Xia <[email protected]> * add comment Signed-off-by: Ruihang Xia <[email protected]> * add test for series_divide Signed-off-by: Ruihang Xia <[email protected]> * update sqlness result Signed-off-by: Ruihang Xia <[email protected]> * fix scalar calculation Signed-off-by: Ruihang Xia <[email protected]> * update sqlness result Signed-off-by: Ruihang Xia <[email protected]> * fix: workaround join partition Signed-off-by: Ruihang Xia <[email protected]> * update proto Signed-off-by: Ruihang Xia <[email protected]> --------- Signed-off-by: Ruihang Xia <[email protected]>
1 parent ee4fe9d commit c16bae3

File tree

25 files changed

+702
-334
lines changed

25 files changed

+702
-334
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ etcd-client = "0.14"
130130
fst = "0.4.7"
131131
futures = "0.3"
132132
futures-util = "0.3"
133-
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "2be0f36b3264e28ab0e1c22a980d0bb634eb3a77" }
133+
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "dd4a1996982534636734674db66e44464b0c0d83" }
134134
hex = "0.4"
135135
http = "1"
136136
humantime = "2.1"

src/cmd/tests/load_config_test.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,8 @@ fn test_load_metasrv_example_config() {
168168
tracing_sample_ratio: Some(Default::default()),
169169
slow_query: SlowQueryOptions {
170170
enable: false,
171-
threshold: Some(Duration::from_secs(10)),
172-
sample_ratio: Some(1.0),
171+
threshold: None,
172+
sample_ratio: None,
173173
},
174174
..Default::default()
175175
},

src/promql/src/extension_plan/histogram_fold.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ impl ExecutionPlan for HistogramFoldExec {
301301
}
302302

303303
fn required_input_distribution(&self) -> Vec<Distribution> {
304-
vec![Distribution::SinglePartition; self.children().len()]
304+
self.input.required_input_distribution()
305305
}
306306

307307
fn maintains_input_order(&self) -> Vec<bool> {

src/promql/src/extension_plan/instant_manipulate.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,9 +352,9 @@ impl Stream for InstantManipulateStream {
352352
type Item = DataFusionResult<RecordBatch>;
353353

354354
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
355-
let timer = std::time::Instant::now();
356355
let poll = match ready!(self.input.poll_next_unpin(cx)) {
357356
Some(Ok(batch)) => {
357+
let timer = std::time::Instant::now();
358358
self.num_series.add(1);
359359
let result = Ok(batch).and_then(|batch| self.manipulate(batch));
360360
self.metric.elapsed_compute().add_elapsed(timer);

src/promql/src/extension_plan/normalize.rs

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use datafusion::common::{DFSchema, DFSchemaRef, Result as DataFusionResult, Stat
2323
use datafusion::error::DataFusionError;
2424
use datafusion::execution::context::TaskContext;
2525
use datafusion::logical_expr::{EmptyRelation, Expr, LogicalPlan, UserDefinedLogicalNodeCore};
26+
use datafusion::physical_plan::expressions::Column as ColumnExpr;
2627
use datafusion::physical_plan::metrics::{
2728
BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricValue, MetricsSet,
2829
};
@@ -32,7 +33,6 @@ use datafusion::physical_plan::{
3233
};
3334
use datatypes::arrow::array::TimestampMillisecondArray;
3435
use datatypes::arrow::datatypes::SchemaRef;
35-
use datatypes::arrow::error::Result as ArrowResult;
3636
use datatypes::arrow::record_batch::RecordBatch;
3737
use futures::{ready, Stream, StreamExt};
3838
use greptime_proto::substrait_extension as pb;
@@ -55,6 +55,7 @@ pub struct SeriesNormalize {
5555
offset: Millisecond,
5656
time_index_column_name: String,
5757
need_filter_out_nan: bool,
58+
tag_columns: Vec<String>,
5859

5960
input: LogicalPlan,
6061
}
@@ -100,6 +101,7 @@ impl UserDefinedLogicalNodeCore for SeriesNormalize {
100101
time_index_column_name: self.time_index_column_name.clone(),
101102
need_filter_out_nan: self.need_filter_out_nan,
102103
input: inputs.into_iter().next().unwrap(),
104+
tag_columns: self.tag_columns.clone(),
103105
})
104106
}
105107
}
@@ -109,12 +111,14 @@ impl SeriesNormalize {
109111
offset: Millisecond,
110112
time_index_column_name: N,
111113
need_filter_out_nan: bool,
114+
tag_columns: Vec<String>,
112115
input: LogicalPlan,
113116
) -> Self {
114117
Self {
115118
offset,
116119
time_index_column_name: time_index_column_name.as_ref().to_string(),
117120
need_filter_out_nan,
121+
tag_columns,
118122
input,
119123
}
120124
}
@@ -129,6 +133,7 @@ impl SeriesNormalize {
129133
time_index_column_name: self.time_index_column_name.clone(),
130134
need_filter_out_nan: self.need_filter_out_nan,
131135
input: exec_input,
136+
tag_columns: self.tag_columns.clone(),
132137
metric: ExecutionPlanMetricsSet::new(),
133138
})
134139
}
@@ -138,6 +143,7 @@ impl SeriesNormalize {
138143
offset: self.offset,
139144
time_index: self.time_index_column_name.clone(),
140145
filter_nan: self.need_filter_out_nan,
146+
tag_columns: self.tag_columns.clone(),
141147
}
142148
.encode_to_vec()
143149
}
@@ -152,6 +158,7 @@ impl SeriesNormalize {
152158
pb_normalize.offset,
153159
pb_normalize.time_index,
154160
pb_normalize.filter_nan,
161+
pb_normalize.tag_columns,
155162
placeholder_plan,
156163
))
157164
}
@@ -162,6 +169,7 @@ pub struct SeriesNormalizeExec {
162169
offset: Millisecond,
163170
time_index_column_name: String,
164171
need_filter_out_nan: bool,
172+
tag_columns: Vec<String>,
165173

166174
input: Arc<dyn ExecutionPlan>,
167175
metric: ExecutionPlanMetricsSet,
@@ -177,7 +185,14 @@ impl ExecutionPlan for SeriesNormalizeExec {
177185
}
178186

179187
fn required_input_distribution(&self) -> Vec<Distribution> {
180-
vec![Distribution::SinglePartition]
188+
let schema = self.input.schema();
189+
vec![Distribution::HashPartitioned(
190+
self.tag_columns
191+
.iter()
192+
// Safety: the tag column names is verified in the planning phase
193+
.map(|tag| Arc::new(ColumnExpr::new_with_schema(tag, &schema).unwrap()) as _)
194+
.collect(),
195+
)]
181196
}
182197

183198
fn properties(&self) -> &PlanProperties {
@@ -198,6 +213,7 @@ impl ExecutionPlan for SeriesNormalizeExec {
198213
time_index_column_name: self.time_index_column_name.clone(),
199214
need_filter_out_nan: self.need_filter_out_nan,
200215
input: children[0].clone(),
216+
tag_columns: self.tag_columns.clone(),
201217
metric: self.metric.clone(),
202218
}))
203219
}
@@ -288,31 +304,24 @@ impl SeriesNormalizeStream {
288304

289305
// bias the timestamp column by offset
290306
let ts_column_biased = if self.offset == 0 {
291-
ts_column.clone()
307+
Arc::new(ts_column.clone()) as _
292308
} else {
293-
TimestampMillisecondArray::from_iter(
309+
Arc::new(TimestampMillisecondArray::from_iter(
294310
ts_column.iter().map(|ts| ts.map(|ts| ts + self.offset)),
295-
)
311+
))
296312
};
297313
let mut columns = input.columns().to_vec();
298-
columns[self.time_index] = Arc::new(ts_column_biased);
299-
300-
// sort the record batch
301-
let ordered_indices = compute::sort_to_indices(&columns[self.time_index], None, None)?;
302-
let ordered_columns = columns
303-
.iter()
304-
.map(|array| compute::take(array, &ordered_indices, None))
305-
.collect::<ArrowResult<Vec<_>>>()?;
306-
let ordered_batch = RecordBatch::try_new(input.schema(), ordered_columns)?;
314+
columns[self.time_index] = ts_column_biased;
307315

316+
let result_batch = RecordBatch::try_new(input.schema(), columns)?;
308317
if !self.need_filter_out_nan {
309-
return Ok(ordered_batch);
318+
return Ok(result_batch);
310319
}
311320

312321
// TODO(ruihang): consider the "special NaN"
313322
// filter out NaN
314323
let mut filter = vec![true; input.num_rows()];
315-
for column in ordered_batch.columns() {
324+
for column in result_batch.columns() {
316325
if let Some(float_column) = column.as_any().downcast_ref::<Float64Array>() {
317326
for (i, flag) in filter.iter_mut().enumerate() {
318327
if float_column.value(i).is_nan() {
@@ -322,7 +331,7 @@ impl SeriesNormalizeStream {
322331
}
323332
}
324333

325-
let result = compute::filter_record_batch(&ordered_batch, &BooleanArray::from(filter))
334+
let result = compute::filter_record_batch(&result_batch, &BooleanArray::from(filter))
326335
.map_err(|e| DataFusionError::ArrowError(e, None))?;
327336
Ok(result)
328337
}
@@ -338,10 +347,10 @@ impl Stream for SeriesNormalizeStream {
338347
type Item = DataFusionResult<RecordBatch>;
339348

340349
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
341-
let timer = std::time::Instant::now();
342350
let poll = match ready!(self.input.poll_next_unpin(cx)) {
343351
Some(Ok(batch)) => {
344352
self.num_series.add(1);
353+
let timer = std::time::Instant::now();
345354
let result = Ok(batch).and_then(|batch| self.normalize(batch));
346355
self.metric.elapsed_compute().add_elapsed(timer);
347356
Poll::Ready(Some(result))
@@ -399,6 +408,7 @@ mod test {
399408
time_index_column_name: TIME_INDEX_COLUMN.to_string(),
400409
need_filter_out_nan: true,
401410
input: memory_exec,
411+
tag_columns: vec!["path".to_string()],
402412
metric: ExecutionPlanMetricsSet::new(),
403413
});
404414
let session_context = SessionContext::default();
@@ -413,11 +423,11 @@ mod test {
413423
"+---------------------+--------+------+\
414424
\n| timestamp | value | path |\
415425
\n+---------------------+--------+------+\
426+
\n| 1970-01-01T00:01:00 | 0.0 | foo |\
427+
\n| 1970-01-01T00:02:00 | 1.0 | foo |\
416428
\n| 1970-01-01T00:00:00 | 10.0 | foo |\
417429
\n| 1970-01-01T00:00:30 | 100.0 | foo |\
418-
\n| 1970-01-01T00:01:00 | 0.0 | foo |\
419430
\n| 1970-01-01T00:01:30 | 1000.0 | foo |\
420-
\n| 1970-01-01T00:02:00 | 1.0 | foo |\
421431
\n+---------------------+--------+------+",
422432
);
423433

@@ -428,11 +438,12 @@ mod test {
428438
async fn test_offset_record_batch() {
429439
let memory_exec = Arc::new(prepare_test_data());
430440
let normalize_exec = Arc::new(SeriesNormalizeExec {
431-
offset: 1_000, // offset 1s
441+
offset: 1_000,
432442
time_index_column_name: TIME_INDEX_COLUMN.to_string(),
433443
need_filter_out_nan: true,
434444
input: memory_exec,
435445
metric: ExecutionPlanMetricsSet::new(),
446+
tag_columns: vec!["path".to_string()],
436447
});
437448
let session_context = SessionContext::default();
438449
let result = datafusion::physical_plan::collect(normalize_exec, session_context.task_ctx())
@@ -446,11 +457,11 @@ mod test {
446457
"+---------------------+--------+------+\
447458
\n| timestamp | value | path |\
448459
\n+---------------------+--------+------+\
460+
\n| 1970-01-01T00:01:01 | 0.0 | foo |\
461+
\n| 1970-01-01T00:02:01 | 1.0 | foo |\
449462
\n| 1970-01-01T00:00:01 | 10.0 | foo |\
450463
\n| 1970-01-01T00:00:31 | 100.0 | foo |\
451-
\n| 1970-01-01T00:01:01 | 0.0 | foo |\
452464
\n| 1970-01-01T00:01:31 | 1000.0 | foo |\
453-
\n| 1970-01-01T00:02:01 | 1.0 | foo |\
454465
\n+---------------------+--------+------+",
455466
);
456467

src/promql/src/extension_plan/range_manipulate.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ impl ExecutionPlan for RangeManipulateExec {
327327
}
328328

329329
fn required_input_distribution(&self) -> Vec<Distribution> {
330-
vec![Distribution::SinglePartition]
330+
self.input.required_input_distribution()
331331
}
332332

333333
fn with_new_children(
@@ -564,18 +564,24 @@ impl RangeManipulateStream {
564564
let mut ranges = vec![];
565565

566566
// calculate for every aligned timestamp (`curr_ts`), assume the ts column is ordered.
567+
let mut range_start_index = 0usize;
567568
for curr_ts in (self.start..=self.end).step_by(self.interval as _) {
568569
let mut range_start = ts_column.len();
569570
let mut range_end = 0;
570-
for (index, ts) in ts_column.values().iter().enumerate() {
571+
let mut cursor = range_start_index;
572+
while cursor < ts_column.len() {
573+
let ts = ts_column.value(cursor);
571574
if ts + self.range >= curr_ts {
572-
range_start = range_start.min(index);
575+
range_start = range_start.min(cursor);
576+
range_start_index = range_start;
573577
}
574-
if *ts <= curr_ts {
575-
range_end = range_end.max(index);
578+
if ts <= curr_ts {
579+
range_end = range_end.max(cursor);
576580
} else {
581+
range_start_index = range_start_index.checked_sub(1usize).unwrap_or_default();
577582
break;
578583
}
584+
cursor += 1;
579585
}
580586
if range_start > range_end {
581587
ranges.push((0, 0));

src/promql/src/extension_plan/scalar_calculate.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,10 @@ impl Stream for ScalarCalculateStream {
504504
None => {
505505
self.done = true;
506506
return match self.batch.take() {
507-
Some(batch) if !self.have_multi_series => Poll::Ready(Some(Ok(batch))),
507+
Some(batch) if !self.have_multi_series => {
508+
self.metric.record_output(batch.num_rows());
509+
Poll::Ready(Some(Ok(batch)))
510+
}
508511
_ => {
509512
let time_array = (self.start..=self.end)
510513
.step_by(self.interval as _)
@@ -517,6 +520,7 @@ impl Stream for ScalarCalculateStream {
517520
Arc::new(Float64Array::from(vec![f64::NAN; nums])),
518521
],
519522
)?;
523+
self.metric.record_output(nan_batch.num_rows());
520524
Poll::Ready(Some(Ok(nan_batch)))
521525
}
522526
};

0 commit comments

Comments
 (0)