Skip to content

Commit a5355b2

Browse files
authored
[optimizer] report per-transform metrics (#30806)
Adds metrics for per-transform information (how often did the transform fire? how often did it change the AST?); enrich the end-to-end optimization time metric to add log-lines with per-transform timing information, which should aid in debugging. Designed with advice from @antiguru and some tweaks from @ggevay. ❤️
1 parent 572b41f commit a5355b2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+441
-131
lines changed

Diff for: Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: src/adapter/src/coord.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -4019,7 +4019,10 @@ pub fn serve(
40194019

40204020
let metrics = Metrics::register_into(&metrics_registry);
40214021
let metrics_clone = metrics.clone();
4022-
let optimizer_metrics = OptimizerMetrics::register_into(&metrics_registry);
4022+
let optimizer_metrics = OptimizerMetrics::register_into(
4023+
&metrics_registry,
4024+
catalog.system_config().optimizer_e2e_latency_warning_threshold(),
4025+
);
40234026
let segment_client_clone = segment_client.clone();
40244027
let coord_now = now.clone();
40254028
let advance_timelines_interval = tokio::time::interval(catalog.config().timestamp_interval);

Diff for: src/adapter/src/optimize/copy_to.rs

+7-2
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,12 @@ impl Optimize<HirRelationExpr> for Optimizer {
160160

161161
// MIR ⇒ MIR optimization (local)
162162
let mut df_meta = DataflowMetainfo::default();
163-
let mut transform_ctx =
164-
TransformCtx::local(&self.config.features, &self.typecheck_ctx, &mut df_meta);
163+
let mut transform_ctx = TransformCtx::local(
164+
&self.config.features,
165+
&self.typecheck_ctx,
166+
&mut df_meta,
167+
Some(&self.metrics),
168+
);
165169
let expr = optimize_mir_local(expr, &mut transform_ctx)?.into_inner();
166170

167171
self.duration += time.elapsed();
@@ -317,6 +321,7 @@ impl<'s> Optimize<LocalMirPlan<Resolved<'s>>> for Optimizer {
317321
&self.config.features,
318322
&self.typecheck_ctx,
319323
&mut df_meta,
324+
Some(&self.metrics),
320325
);
321326
// Run global optimization.
322327
mz_transform::optimize_dataflow(&mut df_desc, &mut transform_ctx, false)?;

Diff for: src/adapter/src/optimize/index.rs

+1
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ impl Optimize<Index> for Optimizer {
178178
&self.config.features,
179179
&self.typecheck_ctx,
180180
&mut df_meta,
181+
Some(&self.metrics),
181182
);
182183
// Run global optimization.
183184
mz_transform::optimize_dataflow(&mut df_desc, &mut transform_ctx, false)?;

Diff for: src/adapter/src/optimize/materialized_view.rs

+7-2
Original file line numberDiff line numberDiff line change
@@ -194,8 +194,12 @@ impl Optimize<HirRelationExpr> for Optimizer {
194194

195195
// MIR ⇒ MIR optimization (local)
196196
let mut df_meta = DataflowMetainfo::default();
197-
let mut transform_ctx =
198-
TransformCtx::local(&self.config.features, &self.typecheck_ctx, &mut df_meta);
197+
let mut transform_ctx = TransformCtx::local(
198+
&self.config.features,
199+
&self.typecheck_ctx,
200+
&mut df_meta,
201+
Some(&self.metrics),
202+
);
199203
let expr = optimize_mir_local(expr, &mut transform_ctx)?.into_inner();
200204

201205
self.duration += time.elapsed();
@@ -282,6 +286,7 @@ impl Optimize<LocalMirPlan> for Optimizer {
282286
&self.config.features,
283287
&self.typecheck_ctx,
284288
&mut df_meta,
289+
Some(&self.metrics),
285290
);
286291
// Apply source monotonicity overrides.
287292
for id in self.force_source_non_monotonic.iter() {

Diff for: src/adapter/src/optimize/peek.rs

+7-2
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,12 @@ impl Optimize<HirRelationExpr> for Optimizer {
178178

179179
// MIR ⇒ MIR optimization (local)
180180
let mut df_meta = DataflowMetainfo::default();
181-
let mut transform_ctx =
182-
TransformCtx::local(&self.config.features, &self.typecheck_ctx, &mut df_meta);
181+
let mut transform_ctx = TransformCtx::local(
182+
&self.config.features,
183+
&self.typecheck_ctx,
184+
&mut df_meta,
185+
Some(&self.metrics),
186+
);
183187
let expr = optimize_mir_local(expr, &mut transform_ctx)?.into_inner();
184188

185189
self.duration += time.elapsed();
@@ -313,6 +317,7 @@ impl<'s> Optimize<LocalMirPlan<Resolved<'s>>> for Optimizer {
313317
&self.config.features,
314318
&self.typecheck_ctx,
315319
&mut df_meta,
320+
Some(&self.metrics),
316321
);
317322

318323
// Let's already try creating a fast path plan. If successful, we don't need to run the

Diff for: src/adapter/src/optimize/subscribe.rs

+7-2
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,12 @@ impl Optimize<SubscribeFrom> for Optimizer {
226226
// let expr = expr.lower(&self.config)?;
227227

228228
// MIR ⇒ MIR optimization (local)
229-
let mut transform_ctx =
230-
TransformCtx::local(&self.config.features, &self.typecheck_ctx, &mut df_meta);
229+
let mut transform_ctx = TransformCtx::local(
230+
&self.config.features,
231+
&self.typecheck_ctx,
232+
&mut df_meta,
233+
Some(&self.metrics),
234+
);
231235
let expr = optimize_mir_local(expr, &mut transform_ctx)?;
232236

233237
df_builder.import_view_into_dataflow(
@@ -268,6 +272,7 @@ impl Optimize<SubscribeFrom> for Optimizer {
268272
&self.config.features,
269273
&self.typecheck_ctx,
270274
&mut df_meta,
275+
Some(&self.metrics),
271276
);
272277
// Run global optimization.
273278
mz_transform::optimize_dataflow(&mut df_desc, &mut transform_ctx, false)?;

Diff for: src/adapter/src/optimize/view.rs

+6-2
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,12 @@ impl Optimize<HirRelationExpr> for Optimizer {
6464
let mut expr = expr.lower(&self.config, self.metrics.as_ref())?;
6565

6666
let mut df_meta = DataflowMetainfo::default();
67-
let mut transform_ctx =
68-
TransformCtx::local(&self.config.features, &self.typecheck_ctx, &mut df_meta);
67+
let mut transform_ctx = TransformCtx::local(
68+
&self.config.features,
69+
&self.typecheck_ctx,
70+
&mut df_meta,
71+
self.metrics.as_ref(),
72+
);
6973

7074
// First, we run a very simple optimizer pipeline, which only folds constants. This takes
7175
// care of constant INSERTs. (This optimizer is also used for INSERTs, not just VIEWs.)

Diff for: src/sql/src/optimizer_metrics.rs

+55-2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
//! Metrics collected by the optimizer.
1111
12+
use std::cell::RefCell;
1213
use std::time::Duration;
1314

1415
use mz_ore::metric;
@@ -20,33 +21,69 @@ use prometheus::{HistogramVec, IntCounterVec};
2021
#[derive(Debug, Clone)]
2122
pub struct OptimizerMetrics {
2223
e2e_optimization_time_seconds: HistogramVec,
24+
e2e_optimization_time_seconds_log_threshold: Duration,
2325
outer_join_lowering_cases: IntCounterVec,
26+
transform_hits: IntCounterVec,
27+
transform_total: IntCounterVec,
28+
/// Local storage of transform times; these are emitted as part of the
29+
/// log-line when end-to-end optimization times exceed the configured threshold.
30+
transform_time_seconds: RefCell<std::collections::BTreeMap<String, Vec<Duration>>>,
2431
}
2532

2633
impl OptimizerMetrics {
27-
pub fn register_into(registry: &MetricsRegistry) -> Self {
34+
pub fn register_into(
35+
registry: &MetricsRegistry,
36+
e2e_optimization_time_seconds_log_threshold: Duration,
37+
) -> Self {
2838
Self {
2939
e2e_optimization_time_seconds: registry.register(metric!(
3040
name: "mz_optimizer_e2e_optimization_time_seconds",
3141
help: "A histogram of end-to-end optimization times since restart.",
3242
var_labels: ["object_type"],
3343
buckets: histogram_seconds_buckets(0.000_128, 8.0),
3444
)),
45+
e2e_optimization_time_seconds_log_threshold,
3546
outer_join_lowering_cases: registry.register(metric!(
3647
name: "outer_join_lowering_cases",
3748
help: "How many times the different outer join lowering cases happened.",
3849
var_labels: ["case"],
3950
)),
51+
transform_hits: registry.register(metric!(
52+
name: "transform_hits",
53+
help: "How many times a given transform changed the plan.",
54+
var_labels: ["transform"],
55+
)),
56+
transform_total: registry.register(metric!(
57+
name: "transform_total",
58+
help: "How many times a given transform was applied.",
59+
var_labels: ["transform"],
60+
)),
61+
transform_time_seconds: RefCell::new(std::collections::BTreeMap::new()),
4062
}
4163
}
4264

4365
pub fn observe_e2e_optimization_time(&self, object_type: &str, duration: Duration) {
4466
self.e2e_optimization_time_seconds
4567
.with_label_values(&[object_type])
4668
.observe(duration.as_secs_f64());
47-
if duration > Duration::from_millis(500) {
69+
if duration > self.e2e_optimization_time_seconds_log_threshold {
70+
let transform_times = self
71+
.transform_time_seconds
72+
.take()
73+
.into_iter()
74+
.map(|(k, v)| {
75+
(
76+
k,
77+
v.into_iter()
78+
.map(|duration| duration.as_nanos())
79+
.collect::<Vec<_>>(),
80+
)
81+
})
82+
.collect::<Vec<_>>();
4883
tracing::warn!(
4984
object_type = object_type,
85+
transform_times = serde_json::to_string(&transform_times)
86+
.unwrap_or_else(|_| format!("{:?}", transform_times)),
5087
duration = format!("{}ms", duration.as_millis()),
5188
"optimizer took more than 500ms"
5289
);
@@ -58,4 +95,20 @@ impl OptimizerMetrics {
5895
.with_label_values(&[case])
5996
.inc()
6097
}
98+
99+
pub fn inc_transform(&self, hit: bool, transform: &str) {
100+
if hit {
101+
self.transform_hits.with_label_values(&[transform]).inc();
102+
}
103+
self.transform_total.with_label_values(&[transform]).inc();
104+
}
105+
106+
pub fn observe_transform_time(&self, transform: &str, duration: Duration) {
107+
let mut transform_time_seconds = self.transform_time_seconds.borrow_mut();
108+
if let Some(times) = transform_time_seconds.get_mut(transform) {
109+
times.push(duration);
110+
} else {
111+
transform_time_seconds.insert(transform.to_string(), vec![duration]);
112+
}
113+
}
61114
}

Diff for: src/sql/src/session/vars.rs

+5
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,7 @@ impl SystemVars {
12151215
&USER_STORAGE_MANAGED_COLLECTIONS_BATCH_DURATION,
12161216
&ENABLE_CREATE_TABLE_FROM_SOURCE,
12171217
&FORCE_SOURCE_TABLE_SYNTAX,
1218+
&OPTIMIZER_E2E_LATENCY_WARNING_THRESHOLD,
12181219
];
12191220

12201221
let dyncfgs = mz_dyncfgs::all_dyncfgs();
@@ -2196,6 +2197,10 @@ impl SystemVars {
21962197
*self.expect_value(&FORCE_SOURCE_TABLE_SYNTAX)
21972198
}
21982199

2200+
pub fn optimizer_e2e_latency_warning_threshold(&self) -> Duration {
2201+
*self.expect_value(&OPTIMIZER_E2E_LATENCY_WARNING_THRESHOLD)
2202+
}
2203+
21992204
/// Returns whether the named variable is a compute configuration parameter
22002205
/// (things that go in `ComputeParameters` and are sent to replicas via `UpdateConfiguration`
22012206
/// commands).

Diff for: src/sql/src/session/vars/definitions.rs

+9
Original file line numberDiff line numberDiff line change
@@ -1539,6 +1539,15 @@ pub static FORCE_SOURCE_TABLE_SYNTAX: VarDefinition = VarDefinition::new(
15391539
true,
15401540
);
15411541

1542+
pub static OPTIMIZER_E2E_LATENCY_WARNING_THRESHOLD: VarDefinition = VarDefinition::new(
1543+
"optimizer_e2e_latency_warning_threshold",
1544+
value!(Duration; Duration::from_millis(500)),
1545+
"Sets the duration that a query can take to compile; queries that take longer \
1546+
will trigger a warning. If this value is specified without units, it is taken as \
1547+
milliseconds. A value of zero disables the timeout (Materialize).",
1548+
true,
1549+
);
1550+
15421551
/// Configuration for gRPC client connections.
15431552
pub mod grpc_client {
15441553
use super::*;

Diff for: src/transform/BUILD.bazel

+5
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ rust_library(
3333
"//src/expr:mz_expr",
3434
"//src/ore:mz_ore",
3535
"//src/repr:mz_repr",
36+
"//src/sql:mz_sql",
3637
] + all_crate_deps(normal = True),
3738
)
3839

@@ -70,6 +71,7 @@ rust_test(
7071
"//src/lowertest:mz_lowertest",
7172
"//src/ore:mz_ore",
7273
"//src/repr:mz_repr",
74+
"//src/sql:mz_sql",
7375
] + all_crate_deps(
7476
normal = True,
7577
normal_dev = True,
@@ -87,6 +89,7 @@ rust_doc_test(
8789
"//src/lowertest:mz_lowertest",
8890
"//src/ore:mz_ore",
8991
"//src/repr:mz_repr",
92+
"//src/sql:mz_sql",
9093
] + all_crate_deps(
9194
normal = True,
9295
normal_dev = True,
@@ -124,6 +127,7 @@ rust_test(
124127
"//src/lowertest:mz_lowertest",
125128
"//src/ore:mz_ore",
126129
"//src/repr:mz_repr",
130+
"//src/sql:mz_sql",
127131
] + all_crate_deps(
128132
normal = True,
129133
normal_dev = True,
@@ -161,6 +165,7 @@ rust_test(
161165
"//src/lowertest:mz_lowertest",
162166
"//src/ore:mz_ore",
163167
"//src/repr:mz_repr",
168+
"//src/sql:mz_sql",
164169
] + all_crate_deps(
165170
normal = True,
166171
normal_dev = True,

Diff for: src/transform/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ mz-compute-types = { path = "../compute-types" }
1717
mz-expr = { path = "../expr" }
1818
mz-ore = { path = "../ore" }
1919
mz-repr = { path = "../repr", features = ["tracing_"] }
20+
mz-sql = { path = "../sql" }
2021
num-traits = "0.2"
2122
ordered-float = { version = "4.2.0", features = ["serde"] }
2223
paste = "1.0.11"

Diff for: src/transform/src/canonicalization.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,16 @@ use crate::TransformCtx;
3232
pub struct ReduceScalars;
3333

3434
impl crate::Transform for ReduceScalars {
35+
fn name(&self) -> &'static str {
36+
"ReduceScalars"
37+
}
38+
3539
#[mz_ore::instrument(
3640
target = "optimizer",
3741
level = "debug",
3842
fields(path.segment = "reduce_scalars")
3943
)]
40-
fn transform(
44+
fn actually_perform_transform(
4145
&self,
4246
relation: &mut MirRelationExpr,
4347
ctx: &mut TransformCtx,

Diff for: src/transform/src/canonicalization/flatmap_to_map.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,16 @@ use crate::TransformCtx;
2020
pub struct FlatMapToMap;
2121

2222
impl crate::Transform for FlatMapToMap {
23+
fn name(&self) -> &'static str {
24+
"FlatMapToMap"
25+
}
26+
2327
#[mz_ore::instrument(
2428
target = "optimizer",
2529
level = "debug",
2630
fields(path.segment = "flatmap_to_map")
2731
)]
28-
fn transform(
32+
fn actually_perform_transform(
2933
&self,
3034
relation: &mut MirRelationExpr,
3135
_: &mut TransformCtx,

Diff for: src/transform/src/canonicalization/projection_extraction.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,16 @@ use crate::TransformCtx;
2020
pub struct ProjectionExtraction;
2121

2222
impl crate::Transform for ProjectionExtraction {
23+
fn name(&self) -> &'static str {
24+
"ProjectionExtraction"
25+
}
26+
2327
#[mz_ore::instrument(
2428
target = "optimizer",
2529
level = "debug",
2630
fields(path.segment = "projection_extraction")
2731
)]
28-
fn transform(
32+
fn actually_perform_transform(
2933
&self,
3034
relation: &mut MirRelationExpr,
3135
_: &mut TransformCtx,

Diff for: src/transform/src/canonicalization/topk_elision.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,16 @@ use crate::TransformCtx;
1919
pub struct TopKElision;
2020

2121
impl crate::Transform for TopKElision {
22+
fn name(&self) -> &'static str {
23+
"TopKElision"
24+
}
25+
2226
#[mz_ore::instrument(
2327
target = "optimizer",
2428
level = "debug",
2529
fields(path.segment = "topk_elision")
2630
)]
27-
fn transform(
31+
fn actually_perform_transform(
2832
&self,
2933
relation: &mut MirRelationExpr,
3034
_: &mut TransformCtx,

0 commit comments

Comments
 (0)