feat(parquet): RowFilter per-predicate tag + observer hook (sketch)

adriangb · claude · adriangb · commit 9c89043f5156 · 2026-05-24T10:03:17.000-05:00
`DatafusionArrowPredicate::try_new_tagged` adds an optional caller `Tag`
plus a `SharedRowFilterObserver` fired on each `evaluate()`.
`build_row_filter_tagged((tag, expr)*, ..., observer)` mirrors
`build_row_filter` but plumbs tags and the shared observer through to
every constructed predicate.

`rows_seen`/`rows_kept` reported to the observer are *conditional* on
prior predicates in the same `RowFilter` (parquet-rs evaluates them in
sequence and filters rows between them) — the marginal-value signal an
adaptive scheduler wants, which falls out of the existing flow.

Untagged callers pay nothing: `Option<SharedRowFilterObserver>` is
`None`, no Arc, no per-batch lock.

Module docs sketch the follow-up `CompoundArrowPredicate` that would own
a sub-tree of physical expressions and do datafusion-side OR / NOT
short-circuit during decode (where exact masks make per-branch stats and
negation sound). Left unimplemented; it stands alone.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs
@@ -52,6 +52,10 @@ pub use page_filter::PagePruningAccessPlanFilter;
 pub use reader::*; // Expose so downstream crates can use it
 pub use row_filter::build_row_filter;
 pub use row_filter::can_expr_be_pushed_down_with_schemas;
+pub use row_filter::{
+    NoopRowFilterObserver, RowFilterObserver, SharedRowFilterObserver,
+    build_row_filter_tagged,
+};
 pub use row_group_filter::RowGroupAccessPlanFilter;
 #[expect(deprecated)]
 pub use schema_coercion::{
diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs
@@ -64,6 +64,42 @@
 //! For example, given a struct column `s {name: Utf8, value: Int32}`:
 //! - `WHERE s['value'] > 5` — pushed down (accesses a primitive leaf)
 //! - `WHERE s IS NOT NULL`  — not pushed down (references the whole struct)
+//!
+//! ## Tracking stats and short-circuiting OR
+//!
+//! Each [`DatafusionArrowPredicate`] optionally carries a [`Tag`] and a
+//! shared [`RowFilterObserver`] (see [`SharedRowFilterObserver`]). When
+//! an observer is set, every `evaluate` call fires
+//! `observer.on_evaluate(tag, rows_seen, rows_kept)`, where `tag` may
+//! be `None`. Untagged row filters built via [`build_row_filter`] pay
+//! nothing (no Arc, no lock).
+//!
+//! `rows_seen` here is the number of rows arriving at *this*
+//! predicate, which parquet-rs reduces between predicates as each
+//! predicate's mask is applied. The signal is therefore *conditional*
+//! selectivity — exactly the quantity an adaptive scheduler needs to
+//! learn the marginal value of running predicate `P_i` after
+//! `P_0..P_{i-1}`.
+//!
+//! ### OR short-circuit at the row-filter layer
+//!
+//! Parquet-rs's `RowFilter` is a flat `Vec<Box<dyn ArrowPredicate>>`
+//! and AND-combines them with column staging between predicates. That
+//! makes top-level AND short-circuit "free" — we keep splitting AND
+//! conjuncts into separate `DatafusionArrowPredicate`s for that
+//! reason.
+//!
+//! `OR` and `NOT` inside an individual conjunct cannot be split that
+//! way; today they are evaluated as a single physical expression with
+//! no short-circuit. A natural follow-up is to introduce a
+//! `CompoundArrowPredicate` that owns a tree of physical expressions
+//! (analog of [`datafusion_pruning::PruningPredicateTree`]) and
+//! short-circuits `OR` (skip branch B if branch A already keeps all
+//! surviving rows) and `NOT` (cheap negation), firing the observer
+//! per leaf actually evaluated. Each compound predicate's
+//! [`ArrowPredicate::projection`] would return the union of its leaves'
+//! masks so the parquet decoder reads the required columns up front;
+//! short-circuit would then save *evaluation*, not IO, for that compound.
 
 use std::collections::BTreeSet;
 use std::sync::Arc;
@@ -87,10 +123,39 @@ use datafusion_physical_expr::utils::{collect_columns, reassign_expr_columns};
 use datafusion_physical_expr::{PhysicalExpr, split_conjunction};
 
 use datafusion_physical_plan::metrics;
+use datafusion_pruning::Tag;
 
 use super::ParquetFileMetrics;
 use super::supported_predicates::supports_list_predicates;
 
+/// Observer fired once per row-filter predicate evaluation. The
+/// `rows_seen`/`rows_kept` counts are *conditional* on prior
+/// predicates in the same `RowFilter` (parquet-rs evaluates them in
+/// sequence and filters rows between predicates), which is exactly
+/// what an adaptive scheduler needs to learn "what does running this
+/// predicate next buy us, given everything that came before".
+///
+/// Untagged predicates pass `tag = None`. The default implementation
+/// does nothing, so [`NoopRowFilterObserver`] needs only an empty `impl`.
+pub trait RowFilterObserver: std::fmt::Debug + Send {
+    /// Called after a `DatafusionArrowPredicate::evaluate` runs.
+    fn on_evaluate(&mut self, _tag: Option<Tag>, _rows_seen: usize, _rows_kept: usize) {}
+}
+
+/// No-op observer. Row filters built via [`build_row_filter`] do not use
+/// it: they carry no observer at all (`None`), so nothing is locked.
+#[derive(Debug, Default, Clone, Copy)]
+pub struct NoopRowFilterObserver;
+impl RowFilterObserver for NoopRowFilterObserver {}
+
+/// Shared handle used to plug a single observer into every
+/// `DatafusionArrowPredicate` constructed for a given file. Parquet-rs
+/// owns the predicates after `RowFilter::new`, so each predicate
+/// holds an `Arc<Mutex<...>>` to the observer rather than a borrow.
+///
+/// This mirrors how `metrics::Count` is plumbed through today.
+pub type SharedRowFilterObserver = Arc<std::sync::Mutex<dyn RowFilterObserver>>;
+
 /// A "compiled" predicate passed to `ParquetRecordBatchStream` to perform
 /// row-level filtering during parquet decoding.
 ///
@@ -119,15 +184,26 @@ pub(crate) struct DatafusionArrowPredicate {
     rows_matched: metrics::Count,
     /// how long was spent evaluating this predicate
     time: metrics::Time,
+    /// Optional caller-supplied tag passed to `observer.on_evaluate`.
+    tag: Option<Tag>,
+    /// Optional observer shared across all predicates in a single
+    /// row filter. `None` means no per-evaluate hook (and no
+    /// per-batch lock acquisition cost).
+    observer: Option<SharedRowFilterObserver>,
 }
 
 impl DatafusionArrowPredicate {
-    /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate`
-    pub fn try_new(
+    /// Create a `DatafusionArrowPredicate`. Pass `tag = None` and
+    /// `observer = None` for untagged row filters — the per-evaluate
+    /// observer call is then elided. Whenever `observer` is `Some`, each
+    /// `evaluate` fires `observer.on_evaluate(tag, rows_seen, rows_kept)`.
+    pub fn try_new_tagged(
         candidate: FilterCandidate,
         rows_pruned: metrics::Count,
         rows_matched: metrics::Count,
         time: metrics::Time,
+        tag: Option<Tag>,
+        observer: Option<SharedRowFilterObserver>,
     ) -> Result<Self> {
         let physical_expr =
             reassign_expr_columns(candidate.expr, &candidate.read_plan.projected_schema)?;
@@ -138,6 +214,8 @@ impl DatafusionArrowPredicate {
             rows_pruned,
             rows_matched,
             time,
+            tag,
+            observer,
         })
     }
 }
@@ -160,6 +238,14 @@ impl ArrowPredicate for DatafusionArrowPredicate {
                 let num_pruned = bool_arr.len() - num_matched;
                 self.rows_pruned.add(num_pruned);
                 self.rows_matched.add(num_matched);
+                if let Some(obs) = &self.observer {
+                    // The caller opted into this per-batch lock by passing
+                    // an observer; untagged RowFilters never construct the
+                    // Arc. A poisoned lock silently skips the callback.
+                    if let Ok(mut guard) = obs.lock() {
+                        guard.on_evaluate(self.tag, bool_arr.len(), num_matched);
+                    }
+                }
                 timer.stop();
                 Ok(bool_arr)
             })
@@ -1018,63 +1104,104 @@ pub fn build_row_filter(
     metadata: &ParquetMetaData,
     reorder_predicates: bool,
     file_metrics: &ParquetFileMetrics,
+) -> Result<Option<RowFilter>> {
+    // Untagged path: split top-level AND conjuncts, no observer.
+    let predicates: Vec<(Option<Tag>, Arc<dyn PhysicalExpr>)> = split_conjunction(expr)
+        .into_iter()
+        .map(|e| (None, Arc::clone(e)))
+        .collect();
+    build_row_filter_inner(
+        &predicates,
+        file_schema,
+        metadata,
+        reorder_predicates,
+        file_metrics,
+        None,
+    )
+}
+
+/// Tagged variant of [`build_row_filter`]. Each `(tag, expr)` pair that
+/// yields a filter candidate becomes one row-filter predicate (no AND
+/// splitting); each evaluation fires `observer.on_evaluate(tag, ..)`.
+pub fn build_row_filter_tagged(
+    tagged: &[(Tag, Arc<dyn PhysicalExpr>)],
+    file_schema: &SchemaRef,
+    metadata: &ParquetMetaData,
+    reorder_predicates: bool,
+    file_metrics: &ParquetFileMetrics,
+    observer: &SharedRowFilterObserver,
+) -> Result<Option<RowFilter>> {
+    let predicates: Vec<(Option<Tag>, Arc<dyn PhysicalExpr>)> = tagged
+        .iter()
+        .map(|(tag, expr)| (Some(*tag), Arc::clone(expr)))
+        .collect();
+    build_row_filter_inner(
+        &predicates,
+        file_schema,
+        metadata,
+        reorder_predicates,
+        file_metrics,
+        Some(observer),
+    )
+}
+
+fn build_row_filter_inner(
+    predicates: &[(Option<Tag>, Arc<dyn PhysicalExpr>)],
+    file_schema: &SchemaRef,
+    metadata: &ParquetMetaData,
+    reorder_predicates: bool,
+    file_metrics: &ParquetFileMetrics,
+    observer: Option<&SharedRowFilterObserver>,
 ) -> Result<Option<RowFilter>> {
     let rows_pruned = &file_metrics.pushdown_rows_pruned;
     let rows_matched = &file_metrics.pushdown_rows_matched;
     let time = &file_metrics.row_pushdown_eval_time;
 
-    // Split into conjuncts:
-    // `a = 1 AND b = 2 AND c = 3` -> [`a = 1`, `b = 2`, `c = 3`]
-    let predicates = split_conjunction(expr);
-
-    // Determine which conjuncts can be evaluated as ArrowPredicates, if any
-    let mut candidates: Vec<FilterCandidate> = predicates
-        .into_iter()
-        .map(|expr| {
+    let mut candidates: Vec<(Option<Tag>, FilterCandidate)> = predicates
+        .iter()
+        .map(|(tag, expr)| {
             FilterCandidateBuilder::new(Arc::clone(expr), Arc::clone(file_schema))
                 .build(metadata)
+                .map(|c| c.map(|c| (*tag, c)))
         })
         .collect::<Result<Vec<_>, _>>()?
         .into_iter()
         .flatten()
         .collect();
 
-    // no candidates
     if candidates.is_empty() {
         return Ok(None);
     }
 
     if reorder_predicates {
-        candidates.sort_unstable_by_key(|c| c.required_bytes);
+        // Reordering applies only at the parquet-rs AND level, so it
+        // is safe to sort by I/O cost across tags too. The observer
+        // sees evaluations in the post-reorder order, which is
+        // exactly the order the rows flow.
+        candidates.sort_unstable_by_key(|(_, c)| c.required_bytes);
     }
 
-    // To avoid double-counting metrics when multiple predicates are used:
-    // - All predicates should count rows_pruned (cumulative pruned rows)
-    // - Only the last predicate should count rows_matched (final result)
-    // This ensures: rows_matched + rows_pruned = total rows processed
     let total_candidates = candidates.len();
 
     candidates
         .into_iter()
         .enumerate()
-        .map(|(idx, candidate)| {
+        .map(|(idx, (tag, candidate))| {
             let is_last = idx == total_candidates - 1;
-
-            // All predicates share the pruned counter (cumulative)
             let predicate_rows_pruned = rows_pruned.clone();
-
-            // Only the last predicate tracks matched rows (final result)
             let predicate_rows_matched = if is_last {
                 rows_matched.clone()
             } else {
                 metrics::Count::new()
             };
 
-            DatafusionArrowPredicate::try_new(
+            DatafusionArrowPredicate::try_new_tagged(
                 candidate,
                 predicate_rows_pruned,
                 predicate_rows_matched,
                 time.clone(),
+                tag,
+                observer.cloned(),
             )
             .map(|pred| Box::new(pred) as _)
         })
@@ -1247,11 +1374,13 @@ mod test {
             .expect("building candidate")
             .expect("candidate expected");
 
-        let mut row_filter = DatafusionArrowPredicate::try_new(
+        let mut row_filter = DatafusionArrowPredicate::try_new_tagged(
             candidate,
             Count::new(),
             Count::new(),
             Time::new(),
+            None,
+            None,
         )
         .expect("creating filter predicate");
 
@@ -1286,11 +1415,13 @@ mod test {
             .expect("building candidate")
             .expect("candidate expected");
 
-        let mut row_filter = DatafusionArrowPredicate::try_new(
+        let mut row_filter = DatafusionArrowPredicate::try_new_tagged(
             candidate,
             Count::new(),
             Count::new(),
             Time::new(),
+            None,
+            None,
         )
         .expect("creating filter predicate");