diff --git a/Cargo.lock b/Cargo.lock index 899b78efcb5e9..b02c4f0275a43 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -156,8 +156,7 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-arith", "arrow-array", @@ -179,8 +178,7 @@ dependencies = [ [[package]] name = "arrow-arith" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-buffer", @@ -193,8 +191,7 @@ dependencies = [ [[package]] name = "arrow-array" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "ahash", "arrow-buffer", @@ -203,7 +200,7 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "num-complex", "num-integer", "num-traits", @@ -236,8 +233,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "bytes", "half", @@ -248,8 +244,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-buffer", @@ -270,8 +265,7 @@ dependencies = [ [[package]] name = "arrow-csv" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca025bd0f38eeecb57c2153c0123b960494138e6a957bbda10da2b25415209fe" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-cast", @@ -285,8 +279,7 @@ dependencies = [ [[package]] name = "arrow-data" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-buffer", "arrow-schema", @@ -298,8 +291,7 @@ dependencies = [ [[package]] name = "arrow-flight" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302b2e036335f3f04d65dad3f74ff1f2aae6dc671d6aa04dc6b61193761e16fb" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ 
"arrow-arith", "arrow-array", @@ -326,8 +318,7 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-buffer", @@ -342,14 +333,14 @@ dependencies = [ [[package]] name = "arrow-json" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ead0914e4861a531be48fe05858265cf854a4880b9ed12618b1d08cba9bebc8" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", - "arrow-data", + "arrow-ord", "arrow-schema", + "arrow-select", "chrono", "half", "indexmap 2.14.0", @@ -366,8 +357,7 @@ dependencies = [ [[package]] name = "arrow-ord" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-buffer", @@ -379,8 +369,7 @@ dependencies = [ [[package]] name = "arrow-row" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-buffer", @@ -392,8 +381,7 @@ dependencies = [ [[package]] name = "arrow-schema" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "bitflags", "serde", @@ -404,8 +392,7 @@ dependencies = [ [[package]] name = "arrow-select" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "ahash", "arrow-array", @@ -418,8 +405,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "arrow-array", "arrow-buffer", @@ -434,9 +420,9 @@ dependencies = [ [[package]] name = "astral-tokio-tar" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c23f3af104b40a3430ccb90ed5f7bd877a8dc5c26fc92fde51a22b40890dcf9" +checksum = "4ce73b17c62717c4b6a9af10b43e87c578b0cac27e00666d48304d3b7d2c0693" dependencies = [ "filetime", "futures-core", @@ -3617,9 +3603,9 @@ dependencies = [ [[package]] name = "idna_adapter" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" dependencies = [ "icu_normalizer", "icu_properties", @@ -4360,8 +4346,7 @@ dependencies = [ [[package]] name = "parquet" version = "58.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" +source = "git+https://github.com/pydantic/arrow-rs.git?branch=adaptive-strategy-swap#403183a87964e8d0b4825b03ccbc24c365ab49a8" dependencies = [ "ahash", "arrow-array", @@ -4377,7 +4362,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "lz4_flex", "num-bigint", "num-integer", @@ -6862,9 +6847,9 @@ dependencies = [ [[package]] name = "whoami" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6a5b12f9df4f978d2cfdb1bd3bac52433f44393342d7ee9c25f5a1c14c0f45d" +checksum = "998767ef88740d1f5b0682a9c53c24431453923962269c2db68ee43788c5a40d" dependencies = [ "libc", "libredox", diff --git a/Cargo.toml b/Cargo.toml index 0c768199c9498..af9aa922ca59e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -203,6 +203,34 @@ url = "2.5.7" uuid = "1.23" zstd = { version = "0.13", default-features = false } +# Override arrow / parquet to the `adaptive-strategy-swap` branch on +# pydantic's fork of arrow-rs, which adds the `swap_strategy` API on +# `ParquetPushDecoder` that the in-decoder adaptive filter scheduling +# depends on. +# +# The full set of arrow-rs workspace crates is listed so transitive +# deps (e.g. `arrow-cast` pulled in via `arrow`) resolve to the patched +# version and we don't link two copies into one binary. +# +# Branch: https://github.com/pydantic/arrow-rs/tree/adaptive-strategy-swap +[patch.crates-io] +arrow = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-arith = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-array = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-buffer = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-cast = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-csv = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-data = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-flight = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-ipc = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-json = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-ord = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-row = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-schema = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-select = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +arrow-string = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } +parquet = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" } + [workspace.lints.clippy] # Detects large 
stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml) large_futures = "warn" diff --git a/datafusion-examples/examples/data_io/json_shredding.rs b/datafusion-examples/examples/data_io/json_shredding.rs index 72fbb56773123..3bef09cb07771 100644 --- a/datafusion-examples/examples/data_io/json_shredding.rs +++ b/datafusion-examples/examples/data_io/json_shredding.rs @@ -92,6 +92,16 @@ pub async fn json_shredding() -> Result<()> { // Set up query execution let mut cfg = SessionConfig::new(); cfg.options_mut().execution.parquet.pushdown_filters = true; + // Force every filter to row-level so the example's + // `pushdown_rows_pruned=1` assertion is deterministic. The default + // adaptive scheduler keeps small-file filters on the post-scan path + // (via the byte-ratio heuristic), where `pushdown_rows_pruned` stays + // 0; setting `filter_pushdown_min_bytes_per_sec = 0` disables that + // heuristic. + cfg.options_mut() + .execution + .parquet + .filter_pushdown_min_bytes_per_sec = 0.0; let ctx = SessionContext::new_with_config(cfg); ctx.runtime_env().register_object_store( ObjectStoreUrl::parse("memory://")?.as_ref(), diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index abd58a556e0a1..a965ac0fcf367 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -919,6 +919,29 @@ config_namespace! { /// parquet reader setting. 0 means no caching. pub max_predicate_cache_size: Option<usize>, default = None + /// (reading) Minimum throughput, in bytes per second, that an adaptive + /// row-level filter must sustain to remain at row-level. Filters that + /// drop below this threshold (with statistical confidence — see + /// `filter_confidence_z`) are demoted to post-scan, or dropped entirely + /// if they were optional (e.g. a hash-join build-side dynamic filter). + /// Set to `0` to force every filter to row-level (skip the threshold + /// check); set to `f64::INFINITY` to keep every filter post-scan. + pub filter_pushdown_min_bytes_per_sec: f64, default = 100.0 * 1024.0 * 1024.0 + + /// (reading) Initial-placement heuristic for adaptive filters: when a + /// filter is first observed, place it at row-level if its column bytes + /// are this fraction or less of the total projection's column bytes. + /// Above this ratio, the filter starts as post-scan and only gets + /// promoted later if measured throughput crosses + /// `filter_pushdown_min_bytes_per_sec`. + pub filter_collecting_byte_ratio_threshold: f64, default = 0.20 + + /// (reading) Z-score for the one-sided confidence interval the adaptive + /// filter scheduler uses when promoting / demoting / dropping filters. + /// Default `2.0` (≈ 97.5%) keeps strategy moves conservative; lower the + /// value for snappier adaptation, raise it for more stable placements. + pub filter_confidence_z: f64, default = 2.0 + // The following options affect writing to parquet files // and map to parquet::file::properties::WriterProperties diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index eaf5a1642e8e2..20fc219ecf1c9 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -210,6 +210,10 @@ impl ParquetOptions { coerce_int96: _, // not used for writer props skip_arrow_metadata: _, max_predicate_cache_size: _, + // Read-time adaptive filter knobs; not used for writer props.
+ filter_pushdown_min_bytes_per_sec: _, + filter_collecting_byte_ratio_threshold: _, + filter_confidence_z: _, } = self; let mut builder = WriterProperties::builder() @@ -483,6 +487,10 @@ mod tests { skip_arrow_metadata: defaults.skip_arrow_metadata, coerce_int96: None, max_predicate_cache_size: defaults.max_predicate_cache_size, + filter_pushdown_min_bytes_per_sec: defaults.filter_pushdown_min_bytes_per_sec, + filter_collecting_byte_ratio_threshold: defaults + .filter_collecting_byte_ratio_threshold, + filter_confidence_z: defaults.filter_confidence_z, use_content_defined_chunking: defaults.use_content_defined_chunking.clone(), } } @@ -600,6 +608,11 @@ mod tests { binary_as_string: global_options_defaults.binary_as_string, skip_arrow_metadata: global_options_defaults.skip_arrow_metadata, coerce_int96: None, + filter_pushdown_min_bytes_per_sec: global_options_defaults + .filter_pushdown_min_bytes_per_sec, + filter_collecting_byte_ratio_threshold: global_options_defaults + .filter_collecting_byte_ratio_threshold, + filter_confidence_z: global_options_defaults.filter_confidence_z, use_content_defined_chunking: props.content_defined_chunking().map(|c| { CdcOptions { min_chunk_size: c.min_chunk_size, diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index dd8c20628b43e..1d96018ed8310 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -166,13 +166,19 @@ mod tests { source = source.with_predicate(predicate); } + // The adaptive selectivity tracker subsumes the static + // `reorder_filters` flag. To keep these row-filter-pushdown + // assertions deterministic regardless of the byte-ratio + // heuristic, force every filter to row-level by setting + // `filter_pushdown_min_bytes_per_sec = 0` (the + // "always-row-level" sentinel). The promote/demote behavior + // exercised by other tests is irrelevant here. 
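// For reference: outside this test harness, the same sentinel is set on a
// session config exactly the way the `json_shredding` example above does it.
// A minimal sketch of that user-facing usage (assumes the usual DataFusion
// prelude imports; illustrative, not part of this patch):
//
//     let mut cfg = SessionConfig::new();
//     cfg.options_mut().execution.parquet.pushdown_filters = true;
//     cfg.options_mut()
//         .execution
//         .parquet
//         .filter_pushdown_min_bytes_per_sec = 0.0; // force row-level
//     let ctx = SessionContext::new_with_config(cfg);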
if self.pushdown_predicate { - source = source - .with_pushdown_filters(true) - .with_reorder_filters(true); - } else { - source = source.with_pushdown_filters(false); + let mut opts = TableParquetOptions::default(); + opts.global.filter_pushdown_min_bytes_per_sec = 0.0; + source = source.with_table_parquet_options(opts); } + source = source.with_pushdown_filters(self.pushdown_predicate); if self.page_index_predicate { source = source.with_enable_page_index(true); diff --git a/datafusion/datasource-parquet/Cargo.toml b/datafusion/datasource-parquet/Cargo.toml index a5855af17a536..d1837f10cb145 100644 --- a/datafusion/datasource-parquet/Cargo.toml +++ b/datafusion/datasource-parquet/Cargo.toml @@ -86,3 +86,7 @@ harness = false [[bench]] name = "parquet_struct_filter_pushdown" harness = false + +[[bench]] +name = "selectivity_tracker" +harness = false diff --git a/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs b/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs index 02137b5a1d288..cd3d2da56be53 100644 --- a/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs +++ b/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs @@ -24,6 +24,7 @@ use arrow::array::{ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use criterion::{Criterion, Throughput, criterion_group, criterion_main}; use datafusion_common::ScalarValue; +use datafusion_datasource_parquet::selectivity::SelectivityTracker; use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter}; use datafusion_expr::{Expr, col}; use datafusion_functions_nested::expr_fn::array_has; @@ -115,9 +116,17 @@ fn scan_with_predicate( let file_metrics = ParquetFileMetrics::new(0, &path.display().to_string(), &metrics); let builder = if pushdown { - if let Some(row_filter) = - build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)? - { + let tracker = Arc::new(SelectivityTracker::new()); + let filters = vec![(0usize, Arc::clone(predicate))]; + let (maybe_row_filter, _unbuildable) = build_row_filter( + &filters, + file_schema, + &metadata, + 0, + &tracker, + &file_metrics, + )?; + if let Some(row_filter) = maybe_row_filter { builder.with_row_filter(row_filter) } else { builder diff --git a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs index b52408d4222d8..cfc326d84fb6b 100644 --- a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs +++ b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs @@ -50,6 +50,7 @@ use arrow::array::{BooleanArray, Int32Array, RecordBatch, StringBuilder, StructA use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use criterion::{Criterion, Throughput, criterion_group, criterion_main}; use datafusion_common::ScalarValue; +use datafusion_datasource_parquet::selectivity::SelectivityTracker; use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter}; use datafusion_expr::{Expr, col}; use datafusion_physical_expr::planner::logical2physical; @@ -210,9 +211,17 @@ fn scan( let mut filter_applied = false; let builder = if pushdown { - if let Some(row_filter) = - build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)? 
- { + let tracker = Arc::new(SelectivityTracker::new()); + let filters = vec![(0usize, Arc::clone(predicate))]; + let (maybe_row_filter, _unbuildable) = build_row_filter( + &filters, + file_schema, + &metadata, + 0, + &tracker, + &file_metrics, + )?; + if let Some(row_filter) = maybe_row_filter { filter_applied = true; builder.with_row_filter(row_filter) } else { diff --git a/datafusion/datasource-parquet/benches/selectivity_tracker.rs b/datafusion/datasource-parquet/benches/selectivity_tracker.rs new file mode 100644 index 0000000000000..9aee0d09cc68b --- /dev/null +++ b/datafusion/datasource-parquet/benches/selectivity_tracker.rs @@ -0,0 +1,333 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Microbenchmarks for [`SelectivityTracker`] hot paths. +//! +//! These benches isolate the tracker from decoder/IO so we can iterate on +//! its data structures independently. The scenarios model the load a +//! ClickBench-style partitioned query puts on the tracker: +//! +//! - a file is opened and each of its row-group morsels asks the tracker +//! where to place each user filter (`partition_filters`); +//! - inside each morsel the decoder hands us one `RecordBatch` at a time +//! and each batch feeds selectivity stats to the tracker (`update`). +//! +//! With the default ClickBench-partitioned workload (100 files × ~2–3 +//! row-group morsels × ~125 batches-per-morsel × ~1–3 filters-per-query), +//! the `update` path fires tens of thousands of times per query and +//! `partition_filters` fires hundreds — both on the scan critical path. +//! +//! Each bench reports the cost of a single representative operation so +//! the per-query overhead follows by simple multiplication. + +use std::sync::Arc; + +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_datasource_parquet::selectivity::{ + FilterId, SelectivityTracker, TrackerConfig, +}; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_expr::expressions::Column; +use parquet::basic::{LogicalType, Type as PhysicalType}; +use parquet::file::metadata::{ + ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, +}; +use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor, Type as SchemaType}; + +/// How many files a ClickBench-partitioned query typically opens. +const NUM_FILES: usize = 100; +/// Morsels per file — two full-row-group chunks is typical for hits_partitioned. +const MORSELS_PER_FILE: usize = 3; +/// Batches per morsel (row_group_rows / batch_size ≈ 500k / 8k). +const BATCHES_PER_MORSEL: usize = 60; +/// Filters per query — matches the worst regressed ClickBench queries. 
+const FILTERS_PER_QUERY: usize = 3; + +fn build_columns(n: usize) -> SchemaDescPtr { + let fields: Vec<_> = (0..n) + .map(|i| { + let name = format!("c{i}"); + SchemaType::primitive_type_builder(&name, PhysicalType::BYTE_ARRAY) + .with_logical_type(Some(LogicalType::String)) + .build() + .unwrap() + .into() + }) + .collect(); + let group = SchemaType::group_type_builder("schema") + .with_fields(fields) + .build() + .unwrap(); + Arc::new(SchemaDescriptor::new(Arc::new(group))) +} + +/// One file with `rg_count` row groups, each nominally `rows_per_rg` rows, +/// `bytes_per_col` compressed bytes per column. +fn build_metadata( + rg_count: usize, + rows_per_rg: i64, + num_cols: usize, + bytes_per_col: i64, +) -> ParquetMetaData { + let schema = build_columns(num_cols); + let row_groups: Vec<_> = (0..rg_count) + .map(|_| { + let cols = (0..num_cols) + .map(|c| { + ColumnChunkMetaData::builder(schema.column(c)) + .set_num_values(rows_per_rg) + .set_total_compressed_size(bytes_per_col) + .build() + .unwrap() + }) + .collect(); + RowGroupMetaData::builder(schema.clone()) + .set_num_rows(rows_per_rg) + .set_column_metadata(cols) + .build() + .unwrap() + }) + .collect(); + let total_rows = rg_count as i64 * rows_per_rg; + let file_meta = FileMetaData::new(1, total_rows, None, None, schema, None); + ParquetMetaData::new(file_meta, row_groups) +} + +/// Produce `n` user filters, each referencing a single column. Column 0 is +/// shared by filter 0 and the projection (filter-in-projection shape, as in +/// ClickBench Q14 `WHERE SearchPhrase <> ''`); the rest sit on columns +/// outside the projection. +fn make_filters(n: usize) -> Vec<(FilterId, Arc<dyn PhysicalExpr>)> { + (0..n) + .map(|i| { + let expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new(&format!("c{i}"), i)); + (i as FilterId, expr) + }) + .collect() +} + +/// Shared setup: tracker pre-warmed with one `partition_filters` call so +/// the filter stats / state entries exist. Models "second morsel onwards". +fn warm_tracker( + config: TrackerConfig, + filters: &[(FilterId, Arc<dyn PhysicalExpr>)], + metadata: &ParquetMetaData, +) -> Arc<SelectivityTracker> { + let tracker = Arc::new(config.build()); + // Seed with a round-trip so HashMap entries exist; otherwise the first + // bench iteration pays the "new filter" insertion cost and later ones + // don't. + let _ = tracker.partition_filters( + filters.to_vec(), + &std::collections::HashSet::new(), + 1_000_000, + metadata, + ); + tracker +} + +/// Per-batch `update` cost. This is the tightest loop — it fires once per +/// decoded batch per active filter. At ClickBench scale that's +/// NUM_FILES × MORSELS_PER_FILE × BATCHES_PER_MORSEL × FILTERS = +/// 54,000 calls per query, so every nanosecond here matters. +fn bench_update(c: &mut Criterion) { + let metadata = build_metadata(2, 500_000, 4, 10_000_000); + let filters = make_filters(FILTERS_PER_QUERY); + let tracker = warm_tracker(TrackerConfig::new(), &filters, &metadata); + + let mut group = c.benchmark_group("selectivity_tracker/update"); + group.throughput(criterion::Throughput::Elements(1)); + group.bench_function("single_call", |b| { + let id = filters[0].0; + b.iter(|| { + tracker.update( + std::hint::black_box(id), + std::hint::black_box(4_096), + std::hint::black_box(8_192), + std::hint::black_box(50_000), + std::hint::black_box(65_536), + ); + }) + }); + + // A realistic per-batch hit: we update every active filter for this + // batch. Mirrors `apply_post_scan_filters_with_stats` calling + // `tracker.update` once per filter per batch.
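// (Aside, not part of the patch: the five positional `update` arguments
// above mirror the call in `apply_post_scan_filters_with_stats` later in
// this diff, `tracker.update(filter_id, num_matched, input_rows,
// eval_nanos, skippable_bytes)`, so the constants 4_096 / 8_192 / 50_000 /
// 65_536 model a half-selective 8k-row batch that took 50µs to evaluate.)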
+ group.bench_function("per_batch_all_filters", |b| { + b.iter(|| { + for (id, _) in &filters { + tracker.update( + std::hint::black_box(*id), + std::hint::black_box(4_096), + std::hint::black_box(8_192), + std::hint::black_box(50_000), + std::hint::black_box(65_536), + ); + } + }) + }); + group.finish(); +} + +/// Per-morsel `partition_filters` cost. Fires once per row-group morsel, +/// so NUM_FILES × MORSELS_PER_FILE ≈ 300 per query. We measure both the +/// "cold" (first) call and the "warm" (re-partition) case. +fn bench_partition_filters(c: &mut Criterion) { + let metadata = build_metadata(2, 500_000, 4, 10_000_000); + let filters = make_filters(FILTERS_PER_QUERY); + let projection_bytes = 40_000_000usize; + + let mut group = c.benchmark_group("selectivity_tracker/partition_filters"); + group.bench_function("cold_first_call", |b| { + b.iter_batched( + || Arc::new(TrackerConfig::new().build()), + |tracker| { + std::hint::black_box(tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + projection_bytes, + &metadata, + )); + }, + criterion::BatchSize::SmallInput, + ) + }); + + // Warm case: tracker already has state for every filter, matches the + // per-morsel path after morsel 0 of any file. + let warm = warm_tracker(TrackerConfig::new(), &filters, &metadata); + group.bench_function("warm_repeat_call", |b| { + b.iter(|| { + std::hint::black_box(warm.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + projection_bytes, + &metadata, + )); + }) + }); + + // Same warm case but after realistic stats have accumulated — this is + // the path that also evaluates the confidence-bound promote/demote + // branches. Seed the tracker with a credible number of `update` calls + // before measuring. + let promoted = warm_tracker(TrackerConfig::new(), &filters, &metadata); + for _ in 0..500 { + for (id, _) in &filters { + promoted.update(*id, 3_000, 8_192, 50_000, 65_536); + } + } + group.bench_function("warm_with_accumulated_stats", |b| { + b.iter(|| { + std::hint::black_box(promoted.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + projection_bytes, + &metadata, + )); + }) + }); + group.finish(); +} + +/// End-to-end "one file open" cost: one `partition_filters` per morsel +/// plus `update` per batch per filter. This matches what a single +/// ClickBench-partitioned file inflicts on the tracker and lets us read +/// the combined improvement from any optimization in one number. +fn bench_file_scan_simulation(c: &mut Criterion) { + let metadata = build_metadata(2, 500_000, 4, 10_000_000); + let filters = make_filters(FILTERS_PER_QUERY); + let projection_bytes = 40_000_000usize; + let warm = warm_tracker(TrackerConfig::new(), &filters, &metadata); + + let mut group = c.benchmark_group("selectivity_tracker/file_scan"); + group.throughput(criterion::Throughput::Elements( + (MORSELS_PER_FILE * BATCHES_PER_MORSEL * FILTERS_PER_QUERY) as u64, + )); + group.bench_function("one_file", |b| { + b.iter(|| { + for _morsel in 0..MORSELS_PER_FILE { + std::hint::black_box(warm.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + projection_bytes, + &metadata, + )); + for _batch in 0..BATCHES_PER_MORSEL { + for (id, _) in &filters { + warm.update(*id, 3_000, 8_192, 50_000, 65_536); + } + } + } + }) + }); + group.finish(); +} + +/// Full-query simulation: [`NUM_FILES`] sequential file scans on a single +/// tracker instance. Closest approximation to the per-query tracker cost +/// a ClickBench user sees. 
+/// +/// Parameterised on morsels-per-file so we can see how sensitive the +/// total cost is to the morsel-split fan-out. +fn bench_query_simulation(c: &mut Criterion) { + let metadata = build_metadata(2, 500_000, 4, 10_000_000); + let filters = make_filters(FILTERS_PER_QUERY); + let projection_bytes = 40_000_000usize; + + let mut group = c.benchmark_group("selectivity_tracker/query"); + group.sample_size(20); + for morsels in [1usize, 2, 3, 5] { + group.bench_with_input( + BenchmarkId::from_parameter(morsels), + &morsels, + |b, &morsels_per_file| { + b.iter_batched( + || Arc::new(TrackerConfig::new().build()), + |tracker| { + for _file in 0..NUM_FILES { + for _morsel in 0..morsels_per_file { + std::hint::black_box(tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + projection_bytes, + &metadata, + )); + for _batch in 0..BATCHES_PER_MORSEL { + for (id, _) in &filters { + tracker.update(*id, 3_000, 8_192, 50_000, 65_536); + } + } + } + } + }, + criterion::BatchSize::SmallInput, + ) + }, + ); + } + group.finish(); +} + +criterion_group!( + benches, + bench_update, + bench_partition_filters, + bench_file_scan_simulation, + bench_query_simulation, +); +criterion_main!(benches); diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 7dda7b1b12811..2feb7814c5733 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -501,6 +501,12 @@ impl FileFormat for ParquetFormat { ) -> Result<Arc<dyn ExecutionPlan>> { let mut metadata_size_hint = None; + let filter_pushdown_min_bytes_per_sec = state + .config_options() + .execution + .parquet + .filter_pushdown_min_bytes_per_sec; + if let Some(metadata) = self.metadata_size_hint() { metadata_size_hint = Some(metadata); } @@ -510,7 +516,10 @@ .downcast_ref::<ParquetSource>() .cloned() .ok_or_else(|| internal_datafusion_err!("Expected ParquetSource"))?; - source = source.with_table_parquet_options(self.options.clone()); + let mut options = self.options.clone(); + options.global.filter_pushdown_min_bytes_per_sec = + filter_pushdown_min_bytes_per_sec; + source = source.with_table_parquet_options(options); // Use the CachedParquetFileReaderFactory let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 8eb5912b919da..ece86af498f82 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -91,6 +91,8 @@ pub struct ParquetFileMetrics { /// number of rows that were stored in the cache after evaluating predicates /// reused for the output.
pub predicate_cache_records: Gauge, + /// Time spent applying filters + pub filter_apply_time: Time, } impl ParquetFileMetrics { @@ -192,6 +194,10 @@ impl ParquetFileMetrics { .with_category(MetricCategory::Rows) .gauge("predicate_cache_records", partition); + let filter_apply_time = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .subset_time("filter_apply_time", partition); + Self { files_ranges_pruned_statistics, predicate_evaluation_errors, @@ -211,6 +217,7 @@ scan_efficiency_ratio, predicate_cache_inner_records, predicate_cache_records, + filter_apply_time, } } } diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs index 9a907f4118a86..eb81383a93ca7 100644 --- a/datafusion/datasource-parquet/src/mod.rs +++ b/datafusion/datasource-parquet/src/mod.rs @@ -33,6 +33,7 @@ mod page_filter; mod reader; mod row_filter; mod row_group_filter; +pub mod selectivity; mod sort; pub mod source; mod supported_predicates; diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index bad1c684b47f5..7f1c8ec476db7 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -68,14 +68,16 @@ use parquet::DecodeResult; use parquet::arrow::ParquetRecordBatchStreamBuilder; use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics; use parquet::arrow::arrow_reader::{ - ArrowReaderMetadata, ArrowReaderOptions, RowSelectionPolicy, + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, RowSelectionPolicy, }; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::parquet_column; -use parquet::arrow::push_decoder::{ParquetPushDecoder, ParquetPushDecoderBuilder}; +use parquet::arrow::push_decoder::{ + ParquetPushDecoder, ParquetPushDecoderBuilder, StrategySwap, +}; use parquet::basic::Type; use parquet::bloom_filter::Sbbf; -use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader}; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; /// Stateless Parquet morselizer implementation. /// @@ -94,8 +96,16 @@ pub(super) struct ParquetMorselizer { pub(crate) limit: Option<usize>, /// If should keep the output rows in order pub preserve_order: bool, - /// Optional predicate to apply during the scan - pub predicate: Option<Arc<dyn PhysicalExpr>>, + /// Optional predicate conjuncts for row filtering during the scan. + /// Each conjunct is tagged with a stable `FilterId` for the adaptive + /// [`crate::selectivity::SelectivityTracker`] so per-filter stats + /// accumulate across files. + pub predicate_conjuncts: + Option<Vec<(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)>>, + /// Adaptive selectivity tracker shared across files. Each opener feeds + /// per-batch stats and asks for the current optimal split between + /// row-level and post-scan placement at row-group boundaries. + pub selectivity_tracker: Arc<crate::selectivity::SelectivityTracker>, /// Table schema, including partition columns. pub table_schema: TableSchema, /// Optional hint for how large the initial request to read parquet metadata /// should be @@ -108,8 +118,6 @@ /// Should the filters be evaluated during the parquet scan using /// [`DataFusionArrowPredicate`](row_filter::DatafusionArrowPredicate)? pub pushdown_filters: bool, - /// Should the filters be reordered to optimize the scan?
- pub reorder_filters: bool, /// Should we force the reader to use RowSelections for filtering pub force_filter_selections: bool, /// Should the page index be read from parquet files, if present, to skip @@ -273,8 +281,15 @@ struct PreparedParquetOpen { physical_file_schema: SchemaRef, output_schema: SchemaRef, projection: ProjectionExprs, - predicate: Option<Arc<dyn PhysicalExpr>>, - reorder_predicates: bool, + /// Predicate conjuncts with stable `FilterId`s for the adaptive + /// selectivity tracker. Carried forward from + /// [`ParquetMorselizer::predicate_conjuncts`]. The combined predicate + /// (used for pruning and `FilePruner`) is recomputed on demand from + /// these conjuncts. + predicate_conjuncts: + Option<Vec<(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)>>, + /// Shared adaptive selectivity tracker. + selectivity_tracker: Arc<crate::selectivity::SelectivityTracker>, pushdown_filters: bool, force_filter_selections: bool, enable_page_index: bool, @@ -597,22 +612,41 @@ impl ParquetMorselizer { )); let mut projection = self.projection.clone(); - let mut predicate = self.predicate.clone(); + let mut predicate_conjuncts = self.predicate_conjuncts.clone(); if !literal_columns.is_empty() { projection = projection.try_map_exprs(|expr| { replace_columns_with_literals(Arc::clone(&expr), &literal_columns) })?; - predicate = predicate - .map(|p| replace_columns_with_literals(p, &literal_columns)) - .transpose()?; + // Rewrite each conjunct individually so per-conjunct FilterIds + // remain stable and continue to refer to the same expression + // across files (modulo literal substitution). + if let Some(ref mut conjuncts) = predicate_conjuncts { + for (_id, expr) in conjuncts.iter_mut() { + *expr = replace_columns_with_literals( + Arc::clone(expr), + &literal_columns, + )?; + } + } } let predicate_creation_errors = MetricBuilder::new(&self.metrics) .with_category(MetricCategory::Rows) .global_counter("num_predicate_creation_errors"); + // Combine conjuncts into a single AND-ed predicate for the file-level + // pruner and for early statistics-driven elimination. The file + // pruner does not need per-conjunct identities — only a boolean + // expression over file-level columns and partition values. + let combined_predicate: Option<Arc<dyn PhysicalExpr>> = + predicate_conjuncts.as_ref().map(|conjuncts| { + datafusion_physical_expr::conjunction( + conjuncts.iter().map(|(_, e)| Arc::clone(e)), + ) + }); + // Apply literal replacements to projection and predicate - let file_pruner = predicate + let file_pruner = combined_predicate .as_ref() .filter(|p| is_dynamic_physical_expr(p) || partitioned_file.has_statistics()) .and_then(|p| { @@ -642,8 +676,8 @@ physical_file_schema: logical_file_schema, output_schema, projection, - predicate, - reorder_predicates: self.reorder_filters, + predicate_conjuncts, + selectivity_tracker: Arc::clone(&self.selectivity_tracker), pushdown_filters: self.pushdown_filters, force_filter_selections: self.force_filter_selections, enable_page_index: self.enable_page_index, @@ -663,6 +697,17 @@ } impl PreparedParquetOpen { + /// Reconstruct a single AND-ed predicate from the per-conjunct list. + /// Used for pruning, page-index setup, and `FilePruner` construction — + /// callers that don't care about the per-conjunct `FilterId` identities. + fn combined_predicate(&self) -> Option<Arc<dyn PhysicalExpr>> { + self.predicate_conjuncts.as_ref().map(|conjuncts| { + datafusion_physical_expr::conjunction( + conjuncts.iter().map(|(_, e)| Arc::clone(e)), + ) + }) + } + /// Attempt file-level pruning before any metadata is loaded.
/// /// Returns `None` if the file can be skipped completely. @@ -804,7 +849,7 @@ impl MetadataLoadedParquetOpen { // columns are appended after file columns in the table schema), // types are the same, and there are no missing columns. Skip the // tree walk entirely in that case. - let needs_rewrite = prepared.predicate.is_some() + let needs_rewrite = prepared.predicate_conjuncts.is_some() || prepared.logical_file_schema != physical_file_schema; if needs_rewrite { let rewriter = prepared.expr_adapter_factory.create( @@ -812,26 +857,34 @@ impl MetadataLoadedParquetOpen { Arc::clone(&physical_file_schema), )?; let simplifier = PhysicalExprSimplifier::new(&physical_file_schema); - prepared.predicate = prepared - .predicate - .map(|p| simplifier.simplify(rewriter.rewrite(p)?)) - .transpose()?; + // Rewrite each conjunct individually so per-conjunct FilterIds + // remain stable across files. + if let Some(ref mut conjuncts) = prepared.predicate_conjuncts { + for (_, expr) in conjuncts.iter_mut() { + *expr = simplifier.simplify(rewriter.rewrite(Arc::clone(expr))?)?; + } + } prepared.projection = prepared .projection .try_map_exprs(|p| simplifier.simplify(rewriter.rewrite(p)?))?; } prepared.physical_file_schema = Arc::clone(&physical_file_schema); + // Combined AND-ed predicate is only used for pruning / page-index + // setup; the conjunct-level identities are preserved separately for + // the adaptive selectivity tracker. + let combined_predicate = prepared.combined_predicate(); + // Build predicates for this specific file let pruning_predicate = build_pruning_predicates( - prepared.predicate.as_ref(), + combined_predicate.as_ref(), &physical_file_schema, &prepared.predicate_creation_errors, ); // Only build page pruning predicate if page index is enabled let page_pruning_predicate = if prepared.enable_page_index { - prepared.predicate.as_ref().and_then(|predicate| { + combined_predicate.as_ref().and_then(|predicate| { let p = build_page_pruning_predicate(predicate, &physical_file_schema); (p.filter_number() > 0).then_some(p) }) @@ -1075,25 +1128,75 @@ impl RowGroupsPrunedParquetOpen { let file_metadata = Arc::clone(reader_metadata.metadata()); let rg_metadata = file_metadata.row_groups(); - // Filter pushdown: evaluate predicates during scan - let row_filter = if let Some(predicate) = prepared - .pushdown_filters - .then_some(prepared.predicate.clone()) - .flatten() + // Adaptive filter placement at file open. Ask the shared + // `SelectivityTracker` to split predicate conjuncts (already adapted + // to `physical_file_schema`) into row-level and post-scan buckets + // based on stats accumulated across earlier files. The same split + // is re-evaluated mid-stream at row-group boundaries via + // `AdaptiveParquetStream::maybe_swap_strategy`. + // + // The set of leaf-column indices in the user projection — passed + // to the tracker so its byte-ratio heuristic only counts filter + // columns *not already in the projection* (a column that's in + // the projection costs zero extra I/O to push down). 
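// Worked example of that byte-ratio heuristic (illustrative numbers, not
// from the patch): with projection columns totalling 40 MB compressed, a
// filter over a 6 MB column outside the projection has ratio 6/40 = 0.15,
// at or under the default `filter_collecting_byte_ratio_threshold` of
// 0.20, so it starts at row-level; a 12 MB filter column (ratio 0.30)
// would start post-scan and wait for a throughput-based promotion.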
+ let projection_columns: std::collections::HashSet<usize> = + datafusion_physical_expr::utils::collect_columns( + &datafusion_physical_expr::conjunction(prepared.projection.expr_iter()), + ) + .iter() + .map(|c| c.index()) + .collect(); + let projection_compressed_bytes = row_filter::total_compressed_bytes( + &projection_columns.iter().copied().collect::<Vec<_>>(), + file_metadata.as_ref(), + ); + + let (row_filter_conjuncts, mut post_scan_conjuncts) = if prepared.pushdown_filters + && let Some(conjuncts) = prepared.predicate_conjuncts.clone() + && !conjuncts.is_empty() { - let row_filter = row_filter::build_row_filter( - &predicate, - &prepared.physical_file_schema, + let partitioned = prepared.selectivity_tracker.partition_filters( + conjuncts, + &projection_columns, + projection_compressed_bytes, file_metadata.as_ref(), - prepared.reorder_predicates, - &prepared.file_metrics, ); + (partitioned.row_filters, partitioned.post_scan) + } else { + (Vec::new(), Vec::new()) + }; - match row_filter { - Ok(Some(filter)) => Some(filter), - Ok(None) => None, + // Build row-level `ArrowPredicate`s for the row_filters bucket. Any + // conjunct that `build_row_filter` reports as `unbuildable` falls + // through to the post-scan bucket so we never silently drop a + // filter — dropping would relax the user's predicate and return + // wrong results. + // Capture the row-filter id set before any potential move into + // `post_scan_conjuncts` on the error fall-through below, so the + // adaptive stream can detect placement changes against this baseline. + let initial_row_filter_ids: std::collections::BTreeSet< + crate::selectivity::FilterId, + > = row_filter_conjuncts.iter().map(|(id, _)| *id).collect(); + + let row_filter = if !row_filter_conjuncts.is_empty() { + match row_filter::build_row_filter( + &row_filter_conjuncts, + &prepared.physical_file_schema, + file_metadata.as_ref(), + projection_compressed_bytes, + &prepared.selectivity_tracker, + &prepared.file_metrics, + ) { + Ok((row_filter, unbuildable)) => { + post_scan_conjuncts.extend(unbuildable); + row_filter + } Err(e) => { - debug!("Ignoring error building row filter for '{predicate:?}': {e}"); + debug!( + "Error building row filter for {row_filter_conjuncts:?}: {e}; \ + falling all row-filter candidates through to post-scan" + ); + post_scan_conjuncts.extend(row_filter_conjuncts); None } } @@ -1101,6 +1204,16 @@ None }; + // If the build above failed and dropped every row-filter candidate + // into post-scan, treat the active set as empty so the first + // mid-stream swap will rebuild from scratch using whatever the + // tracker decides next. + let active_row_filter_ids = if row_filter.is_some() { + initial_row_filter_ids + } else { + std::collections::BTreeSet::new() + }; + // Prune by limit if limit is set and limit order is not sensitive if let (Some(limit), false) = (prepared.limit, prepared.preserve_order) { row_groups.prune_by_limit(limit, rg_metadata, &prepared.file_metrics); @@ -1133,12 +1246,79 @@ } let arrow_reader_metrics = ArrowReaderMetrics::enabled(); + + // Build the decoder's projection over the UNION of the user + // projection and **every** predicate conjunct's columns, regardless + // of whether each conjunct is currently row-level or post-scan. + // + // Why all conjuncts (not just post-scan): a mid-stream + // `maybe_swap_strategy` call can demote a row-level filter to + // post-scan when its measured throughput drops below + // `min_bytes_per_sec`.
The decoder's projection mask is fixed for + // the file (we don't grow it on swap), so any column that *might* + // be referenced by a post-scan filter at some point during the + // file must already be in the mask — otherwise the post-scan + // rebase fails with a schema-lookup error. + // + // Filter-only columns are stripped when the projector runs after + // post-scan filters, so the user-visible output schema is + // unchanged. let read_plan = build_projection_read_plan( - prepared.projection.expr_iter(), + prepared.projection.expr_iter().chain( + prepared + .predicate_conjuncts + .iter() + .flatten() + .map(|(_, expr)| Arc::clone(expr)), + ), &prepared.physical_file_schema, reader_metadata.parquet_schema(), ); + let total_rows: i64 = file_metadata + .row_groups() + .iter() + .map(|rg| rg.num_rows()) + .sum(); + + let post_scan_other_bytes_per_row: Vec<f64> = post_scan_conjuncts + .iter() + .map(|(_, expr)| { + let filter_cols: Vec<usize> = + datafusion_physical_expr::utils::collect_columns(expr) + .iter() + .map(|c| c.index()) + .collect(); + let filter_compressed = row_filter::total_compressed_bytes( + &filter_cols, + file_metadata.as_ref(), + ); + if total_rows > 0 { + projection_compressed_bytes.saturating_sub(filter_compressed) as f64 + / total_rows as f64 + } else { + 0.0 + } + }) + .collect(); + + let stream_schema = Arc::clone(&read_plan.projected_schema); + let replace_schema = stream_schema != prepared.output_schema; + + let rebased_projection = prepared + .projection + .clone() + .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; + let post_scan_filters: Vec<( + crate::selectivity::FilterId, + Arc<dyn PhysicalExpr>, + )> = post_scan_conjuncts + .into_iter() + .map(|(id, expr)| { + reassign_expr_columns(expr, &stream_schema).map(|e| (id, e)) + }) + .collect::<Result<Vec<_>>>()?; + let mut decoder_builder = ParquetPushDecoderBuilder::new_with_metadata(reader_metadata) .with_projection(read_plan.projection_mask) @@ -1157,7 +1337,13 @@ } decoder_builder = decoder_builder.with_row_groups(prepared_plan.row_group_indexes); - if let Some(limit) = prepared.limit { + // Decoder-level limit only applies cleanly when there are no + // post-scan filters; otherwise the decoder might short-circuit + // before post-scan filtering would have rejected enough rows. The + // outer FileStream limit still bounds total output. + if let Some(limit) = prepared.limit + && post_scan_filters.is_empty() + { decoder_builder = decoder_builder.with_limit(limit); } if let Some(max_predicate_cache_size) = prepared.max_predicate_cache_size { @@ -1171,26 +1357,29 @@ prepared.file_metrics.predicate_cache_inner_records.clone(); let predicate_cache_records = prepared.file_metrics.predicate_cache_records.clone(); + let filter_apply_time = prepared.file_metrics.filter_apply_time.clone(); - // Check if we need to replace the schema to handle things like differing nullability or metadata. - // See note below about file vs. output schema. - let stream_schema = read_plan.projected_schema; - let replace_schema = stream_schema != prepared.output_schema; - - // Rebase column indices to match the narrowed stream schema. - // The projection expressions have indices based on physical_file_schema, - // but the stream only contains the columns selected by the ProjectionMask.
- let projection = prepared - .projection - .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; - let projector = projection.make_projector(&stream_schema)?; + let projector = rebased_projection.make_projector(&stream_schema)?; let output_schema = Arc::clone(&prepared.output_schema); let files_ranges_pruned_statistics = prepared.file_metrics.files_ranges_pruned_statistics.clone(); let stream = futures::stream::unfold( - PushDecoderStreamState { + AdaptiveParquetStream { decoder, reader: prepared.async_file_reader, + active_reader: None, + file_metadata: Arc::clone(&file_metadata), + physical_file_schema: Arc::clone(&prepared.physical_file_schema), + stream_schema: Arc::clone(&stream_schema), + file_metrics: prepared.file_metrics.clone(), + tracker: Arc::clone(&prepared.selectivity_tracker), + all_conjuncts: prepared.predicate_conjuncts.unwrap_or_default(), + projection_columns, + projection_compressed_bytes, + active_row_filter_ids, + post_scan_filters, + post_scan_other_bytes_per_row, + filter_apply_time, projector, output_schema, replace_schema, @@ -1198,6 +1387,7 @@ predicate_cache_inner_records, predicate_cache_records, baseline_metrics: prepared.baseline_metrics, + pushdown_filters: prepared.pushdown_filters, }, |state| async move { state.transition().await }, ) @@ -1218,15 +1408,71 @@ } } -/// State for a stream that decodes a single Parquet file using a push-based decoder. +/// State for a stream that decodes a single Parquet file with adaptive +/// filter scheduling. /// -/// The [`transition`](Self::transition) method drives the decoder in a loop: it requests -/// byte ranges from the [`AsyncFileReader`], pushes the fetched data into the -/// [`ParquetPushDecoder`], and yields projected [`RecordBatch`]es until the file is -/// fully consumed. -struct PushDecoderStreamState { +/// The [`transition`](Self::transition) method drives one row group at a +/// time: +/// +/// 1. Pull a [`ParquetRecordBatchReader`] for the next row group via +/// [`ParquetPushDecoder::try_next_reader`], fetching ranges as needed. +/// 2. Iterate the reader synchronously; each batch goes through any +/// post-scan filters (which feed per-filter stats into the shared +/// [`SelectivityTracker`](crate::selectivity::SelectivityTracker)) and +/// then through the projector. +/// 3. When the reader exhausts, ask the tracker to re-partition filters +/// based on accumulated stats. If the placement changed, build a new +/// `RowFilter` and call [`ParquetPushDecoder::swap_strategy`] before +/// requesting the next row group. +/// +/// Why one decoder per file (vs the chunk-per-row-group split in PR #9): +/// - Reuses arrow-rs `PushBuffers` across row groups so already-fetched +/// bytes that survive a strategy swap aren't re-requested. +/// - Avoids per-chunk reader minting and per-chunk `RowFilter` rebuild +/// (`RowFilter` is `!Clone`). +/// - One [`EarlyStoppingStream`] wrap covers the whole file — no +/// chunk-0-only special case for the non-`Clone` `FilePruner`. +struct AdaptiveParquetStream { decoder: ParquetPushDecoder, reader: Box<dyn AsyncFileReader>, + /// Active row-group reader. `None` between row groups (when a swap + /// can be applied) and at start. + active_reader: Option<ParquetRecordBatchReader>, + /// Parquet metadata for the file. Used by the tracker to size filter + /// vs projection bytes when re-partitioning. + file_metadata: Arc<ParquetMetaData>, + /// Schema used for filter expressions before rebase.
+ physical_file_schema: SchemaRef, + /// Wide schema the decoder yields — including post-scan-filter columns + /// not in the user projection. Stable across the file even when a + /// strategy swap moves filters around. + stream_schema: SchemaRef, + file_metrics: ParquetFileMetrics, + tracker: Arc<crate::selectivity::SelectivityTracker>, + /// Full set of predicate conjuncts for this file (with stable + /// FilterIds), re-fed to `partition_filters` at every row-group + /// boundary. + all_conjuncts: Vec<(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)>, + /// Leaf-column indices in the user projection — passed to the tracker + /// so its byte-ratio heuristic can subtract overlap with the + /// projection (a filter column already in the projection costs no + /// extra I/O at row-level). + projection_columns: std::collections::HashSet<usize>, + /// Total compressed bytes for the user projection. Constant across + /// the file; reused at every swap decision. + projection_compressed_bytes: usize, + /// Set of FilterIds currently applied as row-level predicates inside + /// the decoder. A change in this set drives the swap. + active_row_filter_ids: std::collections::BTreeSet<crate::selectivity::FilterId>, + /// Post-scan filters expressed against `stream_schema`. Updated on + /// swap. + post_scan_filters: Vec<(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)>, + /// Per-post-scan-filter "other-bytes-per-row" cost metric — bytes of + /// projection columns *not* referenced by this filter, amortised. + /// Same units as the row-filter path's `other_projected_bytes_per_row` + /// so promote/demote rankings compare on a single axis. + post_scan_other_bytes_per_row: Vec<f64>, + filter_apply_time: datafusion_physical_plan::metrics::Time, projector: Projector, output_schema: Arc<Schema>, replace_schema: bool, @@ -1234,59 +1480,210 @@ predicate_cache_inner_records: Gauge, predicate_cache_records: Gauge, baseline_metrics: BaselineMetrics, + /// Whether filter pushdown is enabled for this file. When `false`, + /// `swap_strategy` is never called and `post_scan_filters` is empty. + pushdown_filters: bool, } -impl PushDecoderStreamState { - /// Advances the decoder state machine until the next [`RecordBatch`] is - /// produced, the file is fully consumed, or an error occurs. +impl AdaptiveParquetStream { + /// Advances the state machine until the next batch is produced, the + /// file is fully consumed, or an error occurs. Drives one row group + /// at a time, swapping filter strategy at row-group boundaries. /// - /// On each iteration the decoder is polled via [`ParquetPushDecoder::try_decode`]: - /// - [`NeedsData`](DecodeResult::NeedsData) – the requested byte ranges are - /// fetched from the [`AsyncFileReader`] and fed back into the decoder. - /// - [`Data`](DecodeResult::Data) – a decoded batch is projected and returned. - /// - [`Finished`](DecodeResult::Finished) – signals end-of-stream (`None`). - /// - /// Takes `self` by value (rather than `&mut self`) so the generated future - /// owns the state directly. This avoids a Stacked Borrows violation under - /// miri where `&mut self` creates a single opaque borrow that conflicts - /// with `unfold`'s ownership across yield points. + /// Takes `self` by value so the generated future owns the state + /// directly — same rationale as the previous `PushDecoderStreamState`: + /// `&mut self` creates a Stacked Borrows conflict with `unfold`'s + /// ownership across yield points under miri.
async fn transition(mut self) -> Option<(Result<RecordBatch>, Self)> { loop { - match self.decoder.try_decode() { - Ok(DecodeResult::NeedsData(ranges)) => { - let data = self - .reader - .get_byte_ranges(ranges.clone()) - .await - .map_err(DataFusionError::from); - match data { - Ok(data) => { - if let Err(e) = self.decoder.push_ranges(ranges, data) { - return Some((Err(DataFusionError::from(e)), self)); + // Step 1: ensure we have a reader for the current row group. + if self.active_reader.is_none() { + // Re-evaluate filter placement at every row-group boundary. + if self.pushdown_filters + && let Err(e) = self.maybe_swap_strategy() + { + return Some((Err(e), self)); + } + // Pull the next reader, fetching data as needed. + loop { + match self.decoder.try_next_reader() { + Ok(DecodeResult::NeedsData(ranges)) => { + match self.reader.get_byte_ranges(ranges.clone()).await { + Ok(data) => { + if let Err(e) = self.decoder.push_ranges(ranges, data) + { + return Some(( + Err(DataFusionError::from(e)), + self, + )); + } + } + Err(e) => { + return Some((Err(DataFusionError::from(e)), self)); + } } } - Err(e) => return Some((Err(e), self)), + Ok(DecodeResult::Data(reader)) => { + self.active_reader = Some(reader); + break; + } + Ok(DecodeResult::Finished) => return None, + Err(e) => return Some((Err(DataFusionError::from(e)), self)), } } - Ok(DecodeResult::Data(batch)) => { - let mut timer = self.baseline_metrics.elapsed_compute().timer(); - self.copy_arrow_reader_metrics(); - let result = self.project_batch(&batch); + } + + // Step 2: pull the next batch out of the active reader. Reader + // iteration is synchronous because all bytes for the row group + // were already pushed before the reader was constructed. + let batch_result = self + .active_reader + .as_mut() + .expect("active_reader set above") + .next(); + let batch = match batch_result { + Some(Ok(batch)) => batch, + Some(Err(e)) => return Some((Err(DataFusionError::from(e)), self)), + None => { + // Row group exhausted — drop the reader so the next + // iteration goes back to step 1 and considers a swap. + self.active_reader = None; + continue; + } + }; + + // Step 3: post-scan filters + projector + schema replacement. + let mut timer = self.baseline_metrics.elapsed_compute().timer(); + self.copy_arrow_reader_metrics(); + let filtered = if self.post_scan_filters.is_empty() { + Ok(batch) + } else { + let start = datafusion_common::instant::Instant::now(); + let r = apply_post_scan_filters_with_stats( + batch, + &self.post_scan_filters, + &self.post_scan_other_bytes_per_row, + &self.tracker, + ); + self.filter_apply_time.add_elapsed(start); + r + }; + match filtered { + // Post-scan may filter every row in a batch. Skip empty + // outputs so the consumer doesn't see noise batches. + Ok(b) if b.num_rows() == 0 => { + timer.stop(); + continue; + } + Ok(b) => { + let result = self.project_batch(&b); timer.stop(); - // Release the borrow on baseline_metrics before moving self drop(timer); return Some((result, self)); } - Ok(DecodeResult::Finished) => { - return None; - } Err(e) => { - return Some((Err(DataFusionError::from(e)), self)); + timer.stop(); + drop(timer); + return Some((Err(e), self)); } } } } + /// Re-evaluate filter placement at a row-group boundary. If the + /// row-filter set has changed, build a new `RowFilter` and apply it + /// via [`ParquetPushDecoder::swap_strategy`]. Updates + /// `post_scan_filters` and `post_scan_other_bytes_per_row` to reflect + /// the new partition.
+    ///
+    /// No-op when the decoder isn't at a swap point or there are no
+    /// conjuncts.
+    fn maybe_swap_strategy(&mut self) -> Result<()> {
+        if !self.decoder.can_swap_strategy() || self.all_conjuncts.is_empty() {
+            return Ok(());
+        }
+        let partitioned = self.tracker.partition_filters(
+            self.all_conjuncts.clone(),
+            &self.projection_columns,
+            self.projection_compressed_bytes,
+            self.file_metadata.as_ref(),
+        );
+        let new_ids: std::collections::BTreeSet<crate::selectivity::FilterId> =
+            partitioned.row_filters.iter().map(|(id, _)| *id).collect();
+        if new_ids == self.active_row_filter_ids {
+            // Placement unchanged for the row-filter set. Post-scan and
+            // dropped filters can change with stats but they don't need a
+            // decoder-level swap — `apply_post_scan_filters_with_stats`
+            // already consults `tracker.is_filter_skipped` per batch.
+            return Ok(());
+        }
+
+        // Rebuild the row filter from the new row-level set.
+        let (row_filter, unbuildable) = row_filter::build_row_filter(
+            &partitioned.row_filters,
+            &self.physical_file_schema,
+            self.file_metadata.as_ref(),
+            self.projection_compressed_bytes,
+            &self.tracker,
+            &self.file_metrics,
+        )?;
+
+        // Combine post-scan + unbuildable into the new post-scan set,
+        // then rebase against the (stable) `stream_schema` and recompute
+        // bytes-per-row metrics.
+        let mut post_scan = partitioned.post_scan;
+        post_scan.extend(unbuildable);
+
+        let total_rows: i64 = self
+            .file_metadata
+            .row_groups()
+            .iter()
+            .map(|rg| rg.num_rows())
+            .sum();
+
+        let post_scan_other_bytes_per_row: Vec<f64> = post_scan
+            .iter()
+            .map(|(_, expr)| {
+                let cols: Vec<usize> =
+                    datafusion_physical_expr::utils::collect_columns(expr)
+                        .iter()
+                        .map(|c| c.index())
+                        .collect();
+                let filter_compressed = row_filter::total_compressed_bytes(
+                    &cols,
+                    self.file_metadata.as_ref(),
+                );
+                if total_rows > 0 {
+                    self.projection_compressed_bytes
+                        .saturating_sub(filter_compressed) as f64
+                        / total_rows as f64
+                } else {
+                    0.0
+                }
+            })
+            .collect();
+
+        let post_scan_rebased: Vec<(
+            crate::selectivity::FilterId,
+            Arc<dyn PhysicalExpr>,
+        )> = post_scan
+            .into_iter()
+            .map(|(id, expr)| {
+                reassign_expr_columns(expr, &self.stream_schema).map(|e| (id, e))
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        self.active_row_filter_ids = new_ids;
+        self.post_scan_filters = post_scan_rebased;
+        self.post_scan_other_bytes_per_row = post_scan_other_bytes_per_row;
+
+        // `with_filter(Some(rf))` installs the new filter; `with_filter(None)`
+        // clears it (when every conjunct moved out of row-level placement).
+        self.decoder
+            .swap_strategy(StrategySwap::new().with_filter(row_filter))
+            .map_err(DataFusionError::from)
+    }
+
     /// Copies metrics from ArrowReaderMetrics (the metrics collected by the
     /// arrow-rs parquet reader) to the parquet file metrics for DataFusion
     fn copy_arrow_reader_metrics(&self) {
@@ -1301,14 +1698,6 @@ impl PushDecoderStreamState {
     fn project_batch(&self, batch: &RecordBatch) -> Result<RecordBatch> {
         let mut batch = self.projector.project_batch(batch)?;
         if self.replace_schema {
-            // Ensure the output batch has the expected schema.
-            // This handles things like schema level and field level metadata, which may not be present
-            // in the physical file schema.
-            // It is also possible for nullability to differ; some writers create files with
-            // OPTIONAL fields even when there are no nulls in the data.
-            // In these cases it may make sense for the logical schema to be `NOT NULL`.
-            // RecordBatch::try_new_with_options checks that if the schema is NOT NULL
-            // the array cannot contain nulls, amongst other checks.
             let (_stream_schema, arrays, num_rows) = batch.into_parts();
             let options = RecordBatchOptions::new().with_row_count(Some(num_rows));
             batch = RecordBatch::try_new_with_options(
@@ -1321,6 +1710,79 @@ impl PushDecoderStreamState {
     }
 }
 
+/// Apply a list of post-scan filters to a batch in order, AND-ing their
+/// boolean masks. Each filter's evaluation reports stats to the shared
+/// [`SelectivityTracker`](crate::selectivity::SelectivityTracker) in the
+/// same units as the row-filter path so promote/demote decisions can
+/// compare row-level and post-scan filter effectiveness on one axis.
+///
+/// `other_bytes_per_row[i]` is the bytes-per-row of the projection columns
+/// *not* referenced by `filters[i]` — i.e. the late-materialization saving
+/// per pruned row.
+fn apply_post_scan_filters_with_stats(
+    batch: RecordBatch,
+    filters: &[(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)],
+    other_bytes_per_row: &[f64],
+    tracker: &crate::selectivity::SelectivityTracker,
+) -> Result<RecordBatch> {
+    use arrow::array::BooleanArray;
+    use arrow::compute::{and, filter_record_batch};
+    use datafusion_common::cast::as_boolean_array;
+
+    if batch.num_rows() == 0 {
+        return Ok(batch);
+    }
+
+    let input_rows = batch.num_rows() as u64;
+    let mut combined_mask: Option<BooleanArray> = None;
+
+    for (i, (id, expr)) in filters.iter().enumerate() {
+        // Mid-stream skip: the tracker sets this flag on
+        // `OptionalFilterPhysicalExpr` whose CI upper bound has fallen
+        // below `min_bytes_per_sec`. Correctness is preserved because the
+        // originating join independently enforces the predicate. We do
+        // not update the tracker for a skipped batch.
+        if tracker.is_filter_skipped(*id) {
+            continue;
+        }
+
+        // Per-batch tracker bookkeeping. We measure every batch (no
+        // sampling): the `Instant + tracker.update` path is hot, but
+        // skipping samples delays first-promotion by N× and that
+        // dominates the steady-state lock contention on
+        // strongly-selective queries (Q22 / Q23 / Q24). The Welford
+        // accumulator converges within the first row group either way.
+        let start = datafusion_common::instant::Instant::now();
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let bool_arr = as_boolean_array(result.as_ref())?;
+        let nanos = start.elapsed().as_nanos() as u64;
+        let num_matched = bool_arr.true_count() as u64;
+
+        // Convert the raw "all the non-filter projection bytes for
+        // this batch" into a *scatter-aware* skippable count: only
+        // the sub-windows of the bool array with zero survivors
+        // represent decode work that late-materialization would
+        // actually skip. A 50% filter on uniform data scores 0
+        // here; a 50% filter on contiguous data scores ~0.5.
+        let total_other_bytes = (other_bytes_per_row[i] * input_rows as f64) as u64;
+        let skippable_bytes =
+            crate::selectivity::count_skippable_bytes(bool_arr, total_other_bytes);
+        tracker.update(*id, num_matched, input_rows, nanos, skippable_bytes);
+
+        if num_matched < input_rows {
+            combined_mask = Some(match combined_mask {
+                Some(prev) => and(&prev, bool_arr)?,
+                None => bool_arr.clone(),
+            });
+        }
+    }
+
+    match combined_mask {
+        Some(mask) => Ok(filter_record_batch(&batch, &mask)?),
+        None => Ok(batch),
+    }
+}
+
 type ConstantColumns = HashMap<String, ScalarValue>;
 
 /// Extract constant column values from statistics, keyed by column name in the logical file schema.
@@ -1667,7 +2129,6 @@ mod test {
         metadata_size_hint: Option<usize>,
         metrics: ExecutionPlanMetricsSet,
         pushdown_filters: bool,
-        reorder_filters: bool,
         force_filter_selections: bool,
         enable_page_index: bool,
         enable_bloom_filter: bool,
@@ -1693,7 +2154,6 @@ mod test {
             metadata_size_hint: None,
             metrics: ExecutionPlanMetricsSet::new(),
             pushdown_filters: false,
-            reorder_filters: false,
             force_filter_selections: false,
             enable_page_index: false,
             enable_bloom_filter: false,
@@ -1741,12 +2201,6 @@ mod test {
             self
         }
 
-        /// Enable filter reordering.
-        fn with_reorder_filters(mut self, enable: bool) -> Self {
-            self.reorder_filters = enable;
-            self
-        }
-
         /// Enable row group stats pruning.
         fn with_row_group_stats_pruning(mut self, enable: bool) -> Self {
             self.enable_row_group_stats_pruning = enable;
             self
         }
@@ -1789,13 +2243,28 @@ mod test {
                 ProjectionExprs::from_indices(&all_indices, &file_schema)
             };
 
+            // Split the test-supplied AND-of-conjuncts predicate into the
+            // tagged-conjunct shape `ParquetMorselizer` now expects. Tests
+            // continue to pass a single `Arc<dyn PhysicalExpr>` for
+            // ergonomics.
+            let predicate_conjuncts = self.predicate.as_ref().map(|p| {
+                datafusion_physical_expr::split_conjunction(p)
+                    .into_iter()
+                    .enumerate()
+                    .map(|(id, expr)| (id, Arc::clone(expr)))
+                    .collect::<Vec<_>>()
+            });
+
             ParquetMorselizer {
                 partition_index: self.partition_index,
                 projection,
                 batch_size: self.batch_size,
                 limit: self.limit,
                 preserve_order: self.preserve_order,
-                predicate: self.predicate,
+                predicate_conjuncts,
+                selectivity_tracker: Arc::new(
+                    crate::selectivity::SelectivityTracker::default(),
+                ),
                 table_schema,
                 metadata_size_hint: self.metadata_size_hint,
                 metrics: self.metrics,
@@ -1803,7 +2272,6 @@ mod test {
                     DefaultParquetFileReaderFactory::new(store),
                 ),
                 pushdown_filters: self.pushdown_filters,
-                reorder_filters: self.reorder_filters,
                 force_filter_selections: self.force_filter_selections,
                 enable_page_index: self.enable_page_index,
                 enable_bloom_filter: self.enable_bloom_filter,
@@ -2241,7 +2709,6 @@ mod test {
                 .with_projection_indices(&[0])
                 .with_predicate(predicate)
                 .with_pushdown_filters(true) // note that this is true!
-                .with_reorder_filters(true)
                 .build()
         };
 
diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs
index c5c372055826b..54362416c1a72 100644
--- a/datafusion/datasource-parquet/src/row_filter.rs
+++ b/datafusion/datasource-parquet/src/row_filter.rs
@@ -65,6 +65,7 @@
 //! - `WHERE s['value'] > 5` — pushed down (accesses a primitive leaf)
 //! - `WHERE s IS NOT NULL` — not pushed down (references the whole struct)
 
+use log::debug;
 use std::collections::BTreeSet;
 use std::sync::Arc;
 
@@ -81,10 +82,10 @@ use parquet::schema::types::SchemaDescriptor;
 use datafusion_common::Result;
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor};
+use datafusion_physical_expr::PhysicalExpr;
 use datafusion_physical_expr::ScalarFunctionExpr;
 use datafusion_physical_expr::expressions::{Column, Literal};
 use datafusion_physical_expr::utils::{collect_columns, reassign_expr_columns};
-use datafusion_physical_expr::{PhysicalExpr, split_conjunction};
 
 use datafusion_physical_plan::metrics;
 
@@ -119,18 +120,49 @@ pub(crate) struct DatafusionArrowPredicate {
     rows_matched: metrics::Count,
     /// how long was spent evaluating this predicate
     time: metrics::Time,
+    /// Stable id used by the adaptive selectivity tracker to key per-filter
+    /// statistics across files.
+    filter_id: crate::selectivity::FilterId,
+    /// Shared handle to the adaptive selectivity tracker. Per-batch stats
+    /// are reported through `update()` after each `evaluate()` call.
+    tracker: Arc<crate::selectivity::SelectivityTracker>,
+    /// Estimated *late-materialization savings* per row for this filter:
+    /// the compressed bytes of projection columns that the filter does
+    /// NOT reference, amortised across the file's rows. When a pruned
+    /// row is dropped by the filter, these are the bytes the reader
+    /// avoids decoding further along the pipeline — the quantity the
+    /// adaptive tracker needs in order to rank filters by "cost avoided
+    /// per unit evaluation time". This MUST match the metric the
+    /// post-scan path reports in `apply_post_scan_filters_with_stats`
+    /// (see `opener.rs::post_scan_other_bytes_per_row`); if the two
+    /// paths disagreed, the tracker would rank row-filter and post-scan
+    /// candidates on incomparable axes and wrongly promote or demote them.
+    other_projected_bytes_per_row: f64,
+    /// Mid-stream "drop" flag, shared with the
+    /// [`crate::selectivity::SelectivityTracker`]. The tracker flips this
+    /// when an `OptionalFilterPhysicalExpr` proves CPU-dominated and
+    /// ineffective; once set, [`Self::evaluate`] returns an all-true mask
+    /// without invoking `physical_expr`. Filter columns are still decoded
+    /// (the parquet decoder cannot be reconfigured mid-scan), so this only
+    /// reclaims CPU, not I/O. Flagged only for filters known to be
+    /// optional, so correctness is preserved by the join itself.
+    skip_flag: Arc<std::sync::atomic::AtomicBool>,
 }
 
 impl DatafusionArrowPredicate {
-    /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate`
+    /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate`.
     pub fn try_new(
         candidate: FilterCandidate,
         rows_pruned: metrics::Count,
         rows_matched: metrics::Count,
         time: metrics::Time,
+        filter_id: crate::selectivity::FilterId,
+        tracker: Arc<crate::selectivity::SelectivityTracker>,
+        other_projected_bytes_per_row: f64,
     ) -> Result<Self> {
         let physical_expr =
             reassign_expr_columns(candidate.expr, &candidate.read_plan.projected_schema)?;
+        let skip_flag = tracker.skip_flag(filter_id);
 
         Ok(Self {
             physical_expr,
@@ -138,6 +170,10 @@ impl DatafusionArrowPredicate {
             rows_pruned,
             rows_matched,
             time,
+            filter_id,
+            tracker,
+            other_projected_bytes_per_row,
+            skip_flag,
         })
     }
 }
@@ -148,10 +184,27 @@ impl ArrowPredicate for DatafusionArrowPredicate {
     }
 
     fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult<BooleanArray> {
+        // Mid-stream drop: the tracker has decided this optional filter is
+        // no longer pulling its weight. Return an all-true mask to bypass
+        // expression evaluation entirely. We still bump `rows_matched` so
+        // the per-predicate count stays consistent with input rows; the
+        // tracker is intentionally NOT updated for skipped batches because
+        // (a) we have nothing meaningful to report and (b) flooding it
+        // with zero-cost samples would mask the underlying effectiveness
+        // signal if the flag is ever cleared.
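+        // Note: the Acquire load below pairs with the tracker's
+        // `swap(true, Ordering::Release)` in `SelectivityTracker::update`,
+        // so a set flag is observed together with the stats writes that
+        // justified it.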
+        if self.skip_flag.load(std::sync::atomic::Ordering::Acquire) {
+            let rows_in_batch = batch.num_rows();
+            self.rows_matched.add(rows_in_batch);
+            return Ok(BooleanArray::from(vec![true; rows_in_batch]));
+        }
+
         // scoped timer updates on drop
         let mut timer = self.time.timer();
+        let start_nanos = datafusion_common::instant::Instant::now();
 
-        self.physical_expr
+        let rows_in_batch = batch.num_rows();
+        let result = self
+            .physical_expr
             .evaluate(&batch)
             .and_then(|v| v.into_array(batch.num_rows()))
             .and_then(|array| {
@@ -161,13 +214,42 @@ impl ArrowPredicate for DatafusionArrowPredicate {
                 self.rows_pruned.add(num_pruned);
                 self.rows_matched.add(num_matched);
                 timer.stop();
-                Ok(bool_arr)
+                Ok((bool_arr, num_matched))
             })
             .map_err(|e| {
                 ArrowError::ComputeError(format!(
                     "Error evaluating filter predicate: {e:?}"
                 ))
-            })
+            });
+
+        match result {
+            Ok((bool_arr, num_matched)) => {
+                let eval_nanos = start_nanos.elapsed().as_nanos() as u64;
+                // Scatter-aware skippable bytes: same units as the
+                // post-scan path (see `apply_post_scan_filters_with_stats`).
+                // At row-level this is a conservative *measurement* of
+                // what the decoder skipped — it counts only fully-empty
+                // sub-windows and ignores the additional savings from
+                // within-window RowSelection narrowing, which biases
+                // the demote-or-not decision in the safe direction.
+                let total_other_bytes = (rows_in_batch as f64
+                    * self.other_projected_bytes_per_row)
+                    .round() as u64;
+                let skippable_bytes = crate::selectivity::count_skippable_bytes(
+                    &bool_arr,
+                    total_other_bytes,
+                );
+                self.tracker.update(
+                    self.filter_id,
+                    num_matched as u64,
+                    rows_in_batch as u64,
+                    eval_nanos,
+                    skippable_bytes,
+                );
+                Ok(bool_arr)
+            }
+            Err(e) => Err(e),
+        }
     }
 }
 
@@ -991,95 +1073,167 @@ fn size_of_columns(columns: &[usize], metadata: &ParquetMetaData) -> Result<usize>
+
+pub(crate) type UnbuildableFilters =
+    Vec<(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)>;
+
+/// Build row-level filters for the row-filter partition chosen by the
+/// adaptive selectivity tracker.
 ///
-/// # Arguments
-/// * `expr` - The filter predicate, already adapted to reference columns in `file_schema`
-/// * `file_schema` - The Arrow schema of the parquet file (the result of converting
-///   the parquet schema to Arrow, potentially with type coercions applied)
-/// * `metadata` - Parquet file metadata used for cost estimation
-/// * `reorder_predicates` - If true, reorder predicates to minimize I/O
-/// * `file_metrics` - Metrics for tracking filter performance
+/// Each input filter keeps its stable filter id so the resulting
+/// `ArrowPredicate`s can report per-batch statistics back to the tracker on
+/// each `evaluate()` call, driving future promote/demote decisions.
 ///
-/// # Returns
-/// * `Ok(Some(row_filter))` if the expression can be used as a RowFilter
-/// * `Ok(None)` if the expression cannot be used as a RowFilter
-/// * `Err(e)` if an error occurs while building the filter
+/// Filters that cannot be represented as an `ArrowPredicate` (e.g. whole
+/// struct references or other unsupported patterns) are returned in the
+/// second element of the returned tuple so the opener can apply them
+/// post-scan instead of silently dropping them.
 ///
-/// Note: The returned `RowFilter` may not contain all conjuncts from the original
-/// expression. Conjuncts that cannot be evaluated as an `ArrowPredicate` are ignored.
+/// # Arguments
+/// * `filters` — The candidate filters paired with their stable ids. Assumed
+///   to already be adapted to reference columns in `file_schema`.
+/// * `file_schema` — The Arrow schema of the parquet file.
+/// * `metadata` — Parquet file metadata used for cost estimation.
+/// * `projection_compressed_bytes` — Total compressed bytes the user
+///   projection reads across the file. Used to derive the per-filter
+///   *late-materialization savings* reported to the tracker, so that
+///   row-filter and post-scan candidates are ranked on a single common
+///   axis.
+/// * `tracker` — Shared adaptive selectivity tracker.
+/// * `file_metrics` — Metrics for tracking filter performance.
 ///
-/// For example, if the expression is `a = 1 AND b = 2 AND c = 3` and `b = 2`
-/// cannot be evaluated for some reason, the returned `RowFilter` will contain
-/// only `a = 1` and `c = 3`.
+/// # Returns
+/// * `Ok((Some(row_filter), unbuildable))` when at least one filter could be
+///   represented as a row-level predicate.
+/// * `Ok((None, unbuildable))` when no filters could be represented as row
+///   filters; all are returned in `unbuildable`.
+/// * `Err(e)` if an error occurs while building the filter.
 pub fn build_row_filter(
-    expr: &Arc<dyn PhysicalExpr>,
+    filters: &[(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)],
     file_schema: &SchemaRef,
     metadata: &ParquetMetaData,
-    reorder_predicates: bool,
+    projection_compressed_bytes: usize,
+    tracker: &Arc<crate::selectivity::SelectivityTracker>,
     file_metrics: &ParquetFileMetrics,
-) -> Result<Option<RowFilter>> {
+) -> Result<(Option<RowFilter>, UnbuildableFilters)> {
     let rows_pruned = &file_metrics.pushdown_rows_pruned;
     let rows_matched = &file_metrics.pushdown_rows_matched;
     let time = &file_metrics.row_pushdown_eval_time;
 
-    // Split into conjuncts:
-    // `a = 1 AND b = 2 AND c = 3` -> [`a = 1`, `b = 2`, `c = 3`]
-    let predicates = split_conjunction(expr);
-
-    // Determine which conjuncts can be evaluated as ArrowPredicates, if any
-    let mut candidates: Vec<FilterCandidate> = predicates
-        .into_iter()
-        .map(|expr| {
-            FilterCandidateBuilder::new(Arc::clone(expr), Arc::clone(file_schema))
-                .build(metadata)
-        })
-        .collect::<Result<Vec<_>, _>>()?
-        .into_iter()
-        .flatten()
-        .collect();
-
-    // no candidates
-    if candidates.is_empty() {
-        return Ok(None);
+    // Total rows in the file, used to amortise compressed-byte totals
+    // over rows. We floor at 1 so empty files don't divide by zero; any
+    // effectiveness contribution in that degenerate case is irrelevant
+    // because there are no batches to track anyway.
+    let total_rows: i64 = metadata.row_groups().iter().map(|rg| rg.num_rows()).sum();
+    let total_rows_f = total_rows.max(1) as f64;
+
+    // Try to build a candidate for each filter independently. Any filter
+    // that can't be represented as an `ArrowPredicate`, for *any* reason
+    // (the candidate builder returned `None`, the builder returned an
+    // `Err`, or the `DatafusionArrowPredicate` constructor failed below),
+    // falls through into `unbuildable` so the caller can apply it
+    // post-scan. Silently dropping any conjunct here would relax the
+    // user's predicate and return wrong results — see the
+    // `post_scan_conjuncts` fallthrough in
+    // `ParquetOpener::build_stream`.
+    let mut buildable: Vec<(crate::selectivity::FilterId, FilterCandidate)> =
+        Vec::with_capacity(filters.len());
+    let mut unbuildable: UnbuildableFilters = Vec::new();
+    for (id, expr) in filters {
+        match FilterCandidateBuilder::new(Arc::clone(expr), Arc::clone(file_schema))
+            .build(metadata)
+        {
+            Ok(Some(c)) => buildable.push((*id, c)),
+            Ok(None) => unbuildable.push((*id, Arc::clone(expr))),
+            Err(e) => {
+                debug!(
+                    "failed to build row-filter candidate for {id}: {e}; falling through to post-scan"
+                );
+                unbuildable.push((*id, Arc::clone(expr)));
+            }
+        }
     }
 
-    if reorder_predicates {
-        candidates.sort_unstable_by_key(|c| c.required_bytes);
+    if buildable.is_empty() {
+        return Ok((None, unbuildable));
     }
 
     // To avoid double-counting metrics when multiple predicates are used:
-    // - All predicates should count rows_pruned (cumulative pruned rows)
-    // - Only the last predicate should count rows_matched (final result)
-    // This ensures: rows_matched + rows_pruned = total rows processed
-    let total_candidates = candidates.len();
-
-    candidates
-        .into_iter()
-        .enumerate()
-        .map(|(idx, candidate)| {
-            let is_last = idx == total_candidates - 1;
-
-            // All predicates share the pruned counter (cumulative)
-            let predicate_rows_pruned = rows_pruned.clone();
-
-            // Only the last predicate tracks matched rows (final result)
-            let predicate_rows_matched = if is_last {
-                rows_matched.clone()
-            } else {
-                metrics::Count::new()
-            };
+    // - All predicates share the cumulative rows_pruned counter
+    // - Only the last predicate writes to rows_matched (final pass count)
+    // This preserves the invariant: rows_matched + rows_pruned = total rows.
+    let total_candidates = buildable.len();
+
+    let mut predicates: Vec<Box<dyn ArrowPredicate>> =
+        Vec::with_capacity(total_candidates);
+    for (idx, (filter_id, candidate)) in buildable.into_iter().enumerate() {
+        let is_last = idx == total_candidates - 1;
+        let predicate_rows_pruned = rows_pruned.clone();
+        let predicate_rows_matched = if is_last {
+            rows_matched.clone()
+        } else {
+            metrics::Count::new()
+        };
+        // Late-materialization savings: bytes of the *non-filter* portion
+        // of the projection, per row. When the filter prunes a row, the
+        // decoder avoids decoding these bytes further downstream — that
+        // is the quantity the tracker needs as `batch_bytes` so its
+        // effectiveness metric (bytes-saved / eval-time) ranks filters
+        // by actual savings rather than by their own read cost. Match the
+        // post-scan path's formula in
+        // `opener.rs::post_scan_other_bytes_per_row`.
+        let other_projected_bytes_per_row =
+            projection_compressed_bytes.saturating_sub(candidate.required_bytes) as f64
+                / total_rows_f;
+        // Remember the original expression before we move `candidate` into
+        // `try_new`, so that a failed predicate construction can fall back
+        // into `unbuildable` rather than being silently dropped.
+        let original_expr = Arc::clone(&candidate.expr);
+        match DatafusionArrowPredicate::try_new(
+            candidate,
+            predicate_rows_pruned,
+            predicate_rows_matched,
+            time.clone(),
+            filter_id,
+            Arc::clone(tracker),
+            other_projected_bytes_per_row,
+        ) {
+            Ok(pred) => predicates.push(Box::new(pred) as _),
+            Err(e) => {
+                debug!(
+                    "failed to construct ArrowPredicate for filter {filter_id}: {e}; \
+                     falling through to post-scan"
+                );
+                unbuildable.push((filter_id, original_expr));
+            }
+        }
+    }
 
-        DatafusionArrowPredicate::try_new(
-            candidate,
-            predicate_rows_pruned,
-            predicate_rows_matched,
-            time.clone(),
-        )
-        .map(|pred| Box::new(pred) as _)
-    })
-    .collect::<Result<Vec<_>, _>>()
-    .map(|filters| Some(RowFilter::new(filters)))
+    if predicates.is_empty() {
+        Ok((None, unbuildable))
+    } else {
+        Ok((Some(RowFilter::new(predicates)), unbuildable))
+    }
+}
+
+/// Estimate the total on-disk (compressed) byte cost of reading the given
+/// leaf column indices across every row group in the file. Used by the
+/// adaptive [`crate::selectivity::SelectivityTracker`] as a cheap proxy for
+/// filter evaluation cost before runtime stats are available.
+pub(crate) fn total_compressed_bytes(
+    column_indices: &[usize],
+    metadata: &ParquetMetaData,
+) -> usize {
+    let mut total: i64 = 0;
+    for rg in metadata.row_groups() {
+        for &idx in column_indices {
+            if let Some(col) = rg.columns().get(idx) {
+                total += col.compressed_size();
+            }
+        }
+    }
+    total.max(0) as usize
 }
 
 #[cfg(test)]
@@ -1183,11 +1337,15 @@ mod test {
             .expect("building candidate")
             .expect("candidate expected");
 
+        let test_tracker = Arc::new(crate::selectivity::SelectivityTracker::new());
         let mut row_filter = DatafusionArrowPredicate::try_new(
             candidate,
             Count::new(),
             Count::new(),
             Time::new(),
+            0,
+            Arc::clone(&test_tracker),
+            0.0,
         )
         .expect("creating filter predicate");
 
@@ -1222,11 +1380,15 @@ mod test {
             .expect("building candidate")
             .expect("candidate expected");
 
+        let test_tracker = Arc::new(crate::selectivity::SelectivityTracker::new());
         let mut row_filter = DatafusionArrowPredicate::try_new(
             candidate,
             Count::new(),
             Count::new(),
             Time::new(),
+            0,
+            Arc::clone(&test_tracker),
+            0.0,
         )
         .expect("creating filter predicate");
 
@@ -1371,10 +1533,18 @@ mod test {
         let file_metrics =
             ParquetFileMetrics::new(0, &format!("{func_name}.parquet"), &metrics);
 
-        let row_filter =
-            build_row_filter(&expr, &file_schema, &metadata, false, &file_metrics)
-                .expect("building row filter")
-                .expect("row filter should exist");
+        let tracker = Arc::new(crate::selectivity::SelectivityTracker::new());
+        let filters = vec![(0usize, expr)];
+        let (row_filter, _unbuildable) = build_row_filter(
+            &filters,
+            &file_schema,
+            &metadata,
+            0,
+            &tracker,
+            &file_metrics,
+        )
+        .expect("building row filter");
+        let row_filter = row_filter.expect("row filter should exist");
 
         let reader = parquet_reader_builder
             .with_row_filter(row_filter)
@@ -1949,10 +2119,18 @@ mod test {
         let metrics = ExecutionPlanMetricsSet::new();
         let file_metrics = ParquetFileMetrics::new(0, "struct_e2e.parquet", &metrics);
 
-        let row_filter =
-            build_row_filter(&expr, &file_schema, &metadata, false, &file_metrics)
-                .expect("building row filter")
-                .expect("row filter should exist");
+        let tracker = Arc::new(crate::selectivity::SelectivityTracker::new());
+        let filters = vec![(0usize, expr)];
+        let (row_filter, _unbuildable) = build_row_filter(
+            &filters,
+            &file_schema,
+            &metadata,
+            0,
+            &tracker,
+            &file_metrics,
+        )
+        .expect("building row filter");
+        let row_filter = row_filter.expect("row filter should exist");
exist"); let reader = parquet_reader_builder .with_row_filter(row_filter) diff --git a/datafusion/datasource-parquet/src/selectivity.rs b/datafusion/datasource-parquet/src/selectivity.rs new file mode 100644 index 0000000000000..20b958b581af6 --- /dev/null +++ b/datafusion/datasource-parquet/src/selectivity.rs @@ -0,0 +1,2326 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Adaptive filter selectivity tracking for Parquet row filters. +//! +//! See [`SelectivityTracker`] for the main entry point, `FilterState` for the +//! per-filter lifecycle, `PartitionedFilters` for the output consumed by +//! `ParquetOpener::open`, and [`FilterId`] for stable filter identification. + +use arrow::array::BooleanArray; +use log::debug; +use parking_lot::{Mutex, RwLock}; +use parquet::file::metadata::ParquetMetaData; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use datafusion_physical_expr::utils::collect_columns; +use datafusion_physical_expr_common::physical_expr::{ + OptionalFilterPhysicalExpr, PhysicalExpr, snapshot_generation, +}; + +/// Window size for the per-batch scatter analysis fed to +/// [`count_skippable_bytes`]. Approximates a parquet data page so that +/// "windows with zero survivors" tracks "pages a row-level decoder +/// could skip". Hardcoded for now; making this configurable (or +/// deriving it from per-row-group page metadata) is a natural follow-up. +pub(crate) const SKIP_WINDOW_ROWS: usize = 8192; + +/// Compute the bytes that late-materialization can plausibly skip for a +/// batch given the predicate output `bool_arr` and the total non-filter +/// projection bytes for that batch. +/// +/// Splits `bool_arr` into [`SKIP_WINDOW_ROWS`]-sized windows; each window +/// with zero survivors represents a page-sized chunk whose +/// other-projection columns the row-level decoder can skip outright. +/// Returns `total_other_bytes × (empty_windows / total_windows)` — +/// scatter-discounted skippable bytes. +/// +/// Interpretation depends on which side calls this: +/// +/// - **Post-scan path**: a *prediction* of bytes-saved-per-sec the +/// row-level path would achieve. The bool_arr we see is over the wide +/// batch in the same row order the decoder would emit, so for single- +/// predicate filters the prediction is faithful (modulo `W` matching +/// the actual parquet page size). +/// +/// - **Row-level path**: a conservative *measurement* of what the +/// decoder actually skipped — within-window RowSelection narrowing is +/// an additional uncounted bonus. So at row-level this is a *lower +/// bound* of real savings, which is the safe direction for the +/// demote-or-not decision. 
+pub(crate) fn count_skippable_bytes(
+    bool_arr: &BooleanArray,
+    total_other_bytes: u64,
+) -> u64 {
+    let n = bool_arr.len();
+    if n == 0 || total_other_bytes == 0 {
+        return 0;
+    }
+    // Short-circuit on the two extremes: avoids a redundant per-window
+    // SIMD scan over the same buffer when the answer is already
+    // determined by the batch-level total. The whole helper otherwise
+    // costs ~2× per-batch `true_count` for nothing.
+    let total_matched = bool_arr.true_count();
+    if total_matched == 0 {
+        // Every window empty: fully skippable.
+        return total_other_bytes;
+    }
+    if total_matched == n {
+        // No window empty: nothing skippable.
+        return 0;
+    }
+    let total_windows = n.div_ceil(SKIP_WINDOW_ROWS);
+    if total_windows == 1 {
+        // One-window batch with mixed matches → not skippable. Avoids
+        // a wasted slice+`true_count`.
+        return 0;
+    }
+    let mut empty_windows: u64 = 0;
+    for i in 0..total_windows {
+        let start = i * SKIP_WINDOW_ROWS;
+        let len = SKIP_WINDOW_ROWS.min(n - start);
+        if bool_arr.slice(start, len).true_count() == 0 {
+            empty_windows += 1;
+        }
+    }
+    ((total_other_bytes as f64 * empty_windows as f64) / total_windows as f64) as u64
+}
+
+/// Stable identifier for a filter conjunct, assigned by `ParquetSource::with_predicate`.
+pub type FilterId = usize;
+
+/// Per-filter lifecycle state in the adaptive filter system.
+///
+/// State transitions:
+/// - **(unseen)** → [`RowFilter`](Self::RowFilter) or [`PostScan`](Self::PostScan)
+///   on first encounter in [`SelectivityTracker::partition_filters`].
+/// - [`PostScan`](Self::PostScan) → [`RowFilter`](Self::RowFilter) when
+///   effectiveness ≥ `min_bytes_per_sec` and enough rows have been observed.
+/// - [`RowFilter`](Self::RowFilter) → [`PostScan`](Self::PostScan) when
+///   effectiveness is below threshold (mandatory filter).
+/// - [`RowFilter`](Self::RowFilter) → [`Dropped`](Self::Dropped) when
+///   effectiveness is below threshold and the filter is optional
+///   ([`OptionalFilterPhysicalExpr`]).
+/// - [`RowFilter`](Self::RowFilter) → [`PostScan`](Self::PostScan)/[`Dropped`](Self::Dropped)
+///   on periodic re-evaluation once the CI upper bound on effectiveness
+///   drops below threshold.
+/// - **Any state** → re-evaluated when a dynamic filter's
+///   `snapshot_generation` changes.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub(crate) enum FilterState {
+    /// Currently a row filter.
+    RowFilter,
+    /// Currently a post-scan filter.
+    PostScan,
+    /// Dropped entirely (insufficient throughput and optional).
+    Dropped,
+}
+
+/// Result of partitioning filters into row filters vs post-scan.
+///
+/// Produced by [`SelectivityTracker::partition_filters`], consumed by
+/// `ParquetOpener::open` to build row-level predicates and post-scan filters.
+///
+/// Filters are partitioned based on their effectiveness threshold.
+///
+/// This type is `pub` to support the [selectivity tracker benchmark
+/// harness](../../benches/selectivity_tracker.rs); treat the layout as
+/// unstable from outside the crate.
+#[derive(Debug, Clone, Default)]
+#[doc(hidden)]
+pub struct PartitionedFilters {
+    /// Filters promoted to row level — individual chained ArrowPredicates
+    pub row_filters: Vec<(FilterId, Arc<dyn PhysicalExpr>)>,
+    /// Filters demoted to post-scan (fast path only)
+    pub post_scan: Vec<(FilterId, Arc<dyn PhysicalExpr>)>,
+}
+
+/// Tracks selectivity statistics for a single filter expression.
+#[derive(Debug, Clone, Default, Copy, PartialEq)]
+struct SelectivityStats {
+    /// Number of rows that matched (passed) the filter
+    rows_matched: u64,
+    /// Total number of rows evaluated
+    rows_total: u64,
+    /// Cumulative evaluation time in nanoseconds
+    eval_nanos: u64,
+    /// Cumulative bytes across batches this filter has been evaluated on
+    bytes_seen: u64,
+    /// Welford's online algorithm: number of per-batch effectiveness samples
+    sample_count: u64,
+    /// Welford's online algorithm: running mean of per-batch effectiveness
+    eff_mean: f64,
+    /// Welford's online algorithm: running sum of squared deviations (M2)
+    eff_m2: f64,
+    /// Whether the underlying expression is wrapped in
+    /// `OptionalFilterPhysicalExpr`. Cached here (rather than looked up
+    /// in [`SelectivityTracker::is_optional`]) so the per-batch hot path
+    /// in [`SelectivityTracker::update`] can skip the
+    /// skip-flag/CI-bound work entirely for non-optional filters with a
+    /// single field load on the already-held stats lock — no extra
+    /// HashMap or `RwLock::read()` per batch.
+    is_optional: bool,
+}
+
+impl SelectivityStats {
+    /// Returns the cumulative effectiveness as an opaque ordering score
+    /// (higher = run first).
+    ///
+    /// Computed from `eff_mean` so it matches the Welford-tracked metric
+    /// fed to CI bounds: per-batch scatter-aware bytes-saved-per-second.
+    /// Callers should not assume the unit.
+    fn effectiveness(&self) -> Option<f64> {
+        if self.sample_count == 0 {
+            return None;
+        }
+        Some(self.eff_mean)
+    }
+
+    /// Returns the lower bound of a confidence interval on mean effectiveness.
+    ///
+    /// Uses Welford's online variance to compute a one-sided CI:
+    /// `mean - z * stderr`. Returns `None` if fewer than 2 samples.
+    fn confidence_lower_bound(&self, confidence_z: f64) -> Option<f64> {
+        if self.sample_count < 2 {
+            return None;
+        }
+        let variance = self.eff_m2 / (self.sample_count - 1) as f64;
+        let stderr = (variance / self.sample_count as f64).sqrt();
+        Some(self.eff_mean - confidence_z * stderr)
+    }
+
+    /// Returns the upper bound of a confidence interval on mean effectiveness.
+    ///
+    /// Uses Welford's online variance: `mean + z * stderr`.
+    /// Returns `None` if fewer than 2 samples.
+    fn confidence_upper_bound(&self, confidence_z: f64) -> Option<f64> {
+        if self.sample_count < 2 {
+            return None;
+        }
+        let variance = self.eff_m2 / (self.sample_count - 1) as f64;
+        let stderr = (variance / self.sample_count as f64).sqrt();
+        Some(self.eff_mean + confidence_z * stderr)
+    }
+
+    /// Update stats with new observations.
+    ///
+    /// `skippable_bytes` is the caller's already-computed estimate of
+    /// non-filter projection bytes that late-materialization would
+    /// actually save for this batch — see [`count_skippable_bytes`] for
+    /// the windowed scatter calculation. The Welford accumulator tracks
+    /// `skippable_bytes × 1e9 / eval_nanos` (= scatter-aware
+    /// bytes-saved-per-second), which is what the promote/demote
+    /// gates compare against `min_bytes_per_sec`.
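+    ///
+    /// A sketch of one accumulation step with made-up numbers: a batch
+    /// with `skippable_bytes = 4_000_000` evaluated in
+    /// `eval_nanos = 1_000_000` contributes a sample of
+    /// `4_000_000 × 1e9 / 1_000_000 = 4e9` bytes/sec, folded in via the
+    /// standard Welford recurrence
+    /// (`mean += (x - mean) / n; m2 += (x - mean_old) * (x - mean_new)`).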
+    fn update(
+        &mut self,
+        matched: u64,
+        total: u64,
+        eval_nanos: u64,
+        skippable_bytes: u64,
+    ) {
+        self.rows_matched += matched;
+        self.rows_total += total;
+        self.eval_nanos += eval_nanos;
+        self.bytes_seen += skippable_bytes;
+
+        if total > 0 && eval_nanos > 0 {
+            let batch_eff = skippable_bytes as f64 * 1e9 / eval_nanos as f64;
+
+            self.sample_count += 1;
+            let delta = batch_eff - self.eff_mean;
+            self.eff_mean += delta / self.sample_count as f64;
+            let delta2 = batch_eff - self.eff_mean;
+            self.eff_m2 += delta * delta2;
+        }
+    }
+}
+
+/// Immutable configuration for a [`SelectivityTracker`].
+///
+/// Use the builder methods to customise, then call [`build()`](TrackerConfig::build)
+/// to produce a ready-to-use tracker.
+#[doc(hidden)]
+pub struct TrackerConfig {
+    /// Minimum bytes/sec throughput for promoting a filter
+    /// (default: INFINITY = disabled).
+    pub min_bytes_per_sec: f64,
+    /// Byte-ratio threshold for initial filter placement (row-level vs post-scan).
+    /// Computed as `filter_compressed_bytes / projection_compressed_bytes`.
+    /// When low, the filter columns are small relative to the projection,
+    /// so row-level placement enables large late-materialization savings.
+    /// When high, the filter columns dominate the projection, so there's
+    /// little benefit from late materialization.
+    /// Default is 0.20.
+    pub byte_ratio_threshold: f64,
+    /// Z-score for confidence intervals on filter effectiveness.
+    /// Lower values (e.g. 1.0 or 0.0) make the tracker more aggressive
+    /// about promotion/demotion based on limited data. Higher values
+    /// (e.g. 3.0) require more confidence before changing filter states.
+    /// Default is 2.0, corresponding to ~97.5% one-sided confidence.
+    /// Set to <= 0.0 to disable confidence intervals and promote/demote
+    /// based on point estimates alone (not recommended). Set to INFINITY
+    /// to disable promotion entirely (overrides `min_bytes_per_sec`).
+    pub confidence_z: f64,
+}
+
+impl TrackerConfig {
+    pub fn new() -> Self {
+        Self {
+            min_bytes_per_sec: f64::INFINITY,
+            byte_ratio_threshold: 0.20,
+            confidence_z: 2.0,
+        }
+    }
+
+    pub fn with_min_bytes_per_sec(mut self, v: f64) -> Self {
+        self.min_bytes_per_sec = v;
+        self
+    }
+
+    pub fn with_byte_ratio_threshold(mut self, v: f64) -> Self {
+        self.byte_ratio_threshold = v;
+        self
+    }
+
+    pub fn with_confidence_z(mut self, v: f64) -> Self {
+        self.confidence_z = v;
+        self
+    }
+
+    pub fn build(self) -> SelectivityTracker {
+        SelectivityTracker {
+            config: self,
+            filter_stats: RwLock::new(HashMap::new()),
+            skip_flags: RwLock::new(HashMap::new()),
+            inner: Mutex::new(SelectivityTrackerInner::new()),
+        }
+    }
+}
+
+impl Default for TrackerConfig {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Cross-file adaptive system that measures filter effectiveness and decides
+/// which filters are promoted to row-level predicates (pushed into the Parquet
+/// reader) vs. applied post-scan (demoted) or dropped entirely.
+///
+/// # Locking design
+///
+/// All locks are **private** to this struct — external callers cannot hold a
+/// guard across expensive work, and all lock-holding code paths are auditable
+/// in this file alone.
+///
+/// State is split across two independent locks to minimise contention between
+/// the hot per-batch `update()` path and the cold per-file-open
+/// `partition_filters()` path:
+///
+/// - **`filter_stats`** (`RwLock<HashMap<FilterId, Mutex<SelectivityStats>>>`)
+///   — `update()` acquires a *shared read* lock on the outer map, then a
+///   per-filter `Mutex` to increment counters.
+///   Multiple threads updating
+///   *different* filters never contend at all; threads updating the *same*
+///   filter serialize only on the cheap per-filter `Mutex` (~100 ns).
+///   `partition_filters()` also takes a read lock here when it needs to
+///   inspect stats for promotion/demotion decisions, so it never blocks
+///   `update()` callers. The write lock is taken only briefly in Phase 2
+///   of `partition_filters()` to insert entries for newly-seen filter IDs.
+///
+/// - **`inner`** (`Mutex<SelectivityTrackerInner>`) — holds the filter
+///   state-machine (`filter_states`) and dynamic-filter generation tracking.
+///   Only `partition_filters()` acquires this lock (once per file open), so
+///   concurrent `update()` calls are completely unaffected.
+///
+/// ## Lock ordering (deadlock-free)
+///
+/// Locks are always acquired in the order `inner` → `filter_stats` →
+/// per-filter `Mutex`. Because `update()` never acquires `inner`, no
+/// cycle is possible.
+///
+/// ## Correctness of concurrent access
+///
+/// `update()` may write stats while `partition_filters()` reads them for
+/// promotion/demotion. Both hold a shared `filter_stats` read lock; the
+/// per-filter `Mutex` ensures they do not interleave on the same filter's
+/// stats. One proceeds first; the other sees a consistent (slightly newer
+/// or older) snapshot. This is benign — the single-lock design that
+/// preceded this split already allowed stats to change between consecutive
+/// reads within `partition_filters()`.
+///
+/// On promote/demote, `partition_filters()` zeros a filter's stats via the
+/// per-filter `Mutex`. An `update()` running concurrently may write one
+/// stale batch's worth of data to the freshly-zeroed stats; this is quickly
+/// diluted by hundreds of correct-context batches and is functionally
+/// identical to the old design where `update()` queued behind the write
+/// lock and ran immediately after.
+///
+/// # Filter state machine
+///
+/// ```text
+///                          ┌─────────┐
+///                          │   New   │
+///                          └─────────┘
+///                               │
+///                               ▼
+///                  ┌────────────────────────┐
+///                  │     Estimated Cost     │
+///                  │Bytes needed for filter │
+///                  └────────────────────────┘
+///                               │
+///            ┌──────────────────┴──────────────────┐
+///   ┌────────▼────────┐                   ┌────────▼────────┐
+///   │    Post-scan    │                   │   Row filter    │
+///   │                 │                   │                 │
+///   └─────────────────┘                   └─────────────────┘
+///            │                                     │
+///            ▼                                     ▼
+///   ┌─────────────────┐                   ┌─────────────────┐
+///   │  Effectiveness  │                   │  Effectiveness  │
+///   │  Bytes pruned   │                   │  Bytes pruned   │
+///   │       per       │                   │       per       │
+///   │Second of compute│                   │Second of compute│
+///   └─────────────────┘                   └─────────────────┘
+///            │                                     │
+///            └──────────────────┬──────────────────┘
+///                               ▼
+///   ┌───────────────────────────────────────────────┐
+///   │                   New Scan                    │
+///   │  Move filters based on effectiveness.         │
+///   │  Promote (move post-scan -> row filter).      │
+///   │  Demote (move row-filter -> post-scan).       │
+///   │  Disable (for optional filters; either row    │
+///   │  filter or disabled).                         │
+///   └───────────────────────────────────────────────┘
+///                               │
+///            ┌──────────────────┴──────────────────┐
+///   ┌────────▼────────┐                   ┌────────▼────────┐
+///   │    Post-scan    │                   │   Row filter    │
+///   │                 │                   │                 │
+///   └─────────────────┘                   └─────────────────┘
+/// ```
+///
+/// See `TrackerConfig` for configuration knobs.
+pub struct SelectivityTracker {
+    config: TrackerConfig,
+    /// Per-filter selectivity statistics, each individually `Mutex`-protected.
+    ///
+    /// The outer `RwLock` is almost always read-locked: both `update()` (hot,
+    /// per-batch) and `partition_filters()` (cold, per-file-open) only need
+    /// shared access to look up existing entries. The write lock is taken
+    /// only when `partition_filters()` inserts entries for newly-seen filter
+    /// IDs — a brief, infrequent operation.
+    ///
+    /// Each inner `Mutex<SelectivityStats>` protects a single filter's
+    /// counters, so concurrent `update()` calls on *different* filters
+    /// proceed in parallel with zero contention.
+    filter_stats: RwLock<HashMap<FilterId, Mutex<SelectivityStats>>>,
+    /// Per-filter "skip" flags — when set, the corresponding filter is
+    /// treated as a no-op by both the row-filter
+    /// (`DatafusionArrowPredicate::evaluate`) and the post-scan path
+    /// (`apply_post_scan_filters_with_stats`). This is the mid-stream
+    /// equivalent of dropping an optional filter: once the per-batch
+    /// `update()` path proves an `OptionalFilterPhysicalExpr` is
+    /// CPU-dominated and ineffective, it flips the flag and subsequent
+    /// batches stop paying the evaluation cost. The decoder still decodes
+    /// the filter columns (we cannot rebuild it mid-scan), so I/O is not
+    /// reclaimed; only the predicate evaluation is skipped.
+    ///
+    /// Only ever set for filters whose `is_optional` flag (cached on the
+    /// per-filter [`SelectivityStats`]) is `true` — mandatory filters
+    /// must always execute or queries return wrong rows.
+    skip_flags: RwLock<HashMap<FilterId, Arc<AtomicBool>>>,
+    /// Filter lifecycle state machine and dynamic-filter generation tracking.
+    ///
+    /// Only `partition_filters()` acquires this lock (once per file open).
+    /// `update()` never touches it, so the hot per-batch path is completely
+    /// decoupled from the cold state-machine path.
+    inner: Mutex<SelectivityTrackerInner>,
+}
+
+impl std::fmt::Debug for SelectivityTracker {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SelectivityTracker")
+            .field("config.min_bytes_per_sec", &self.config.min_bytes_per_sec)
+            .finish()
+    }
+}
+
+impl Default for SelectivityTracker {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SelectivityTracker {
+    /// Create a new tracker with default settings (feature disabled).
+    pub fn new() -> Self {
+        TrackerConfig::new().build()
+    }
+
+    /// Update stats for a filter after processing a batch.
+    ///
+    /// **Locking:** acquires `filter_stats.read()` (shared) then a per-filter
+    /// `Mutex`. Never touches `inner`, so this hot per-batch path cannot
+    /// contend with the cold per-file-open `partition_filters()` path.
+    ///
+    /// Silently skips unknown filter IDs (can occur if `update()` is called
+    /// before `partition_filters()` has registered the filter — in practice
+    /// this cannot happen because `partition_filters()` runs during file open
+    /// before any batches are processed).
+    ///
+    /// **Mid-stream drop:** every update of an optional filter re-evaluates
+    /// the CI upper bound; if it falls below
+    /// `min_bytes_per_sec` and the filter is wrapped in
+    /// `OptionalFilterPhysicalExpr`, we set the per-filter skip flag.
+    /// Subsequent calls to `DatafusionArrowPredicate::evaluate` (row-level)
+    /// and `apply_post_scan_filters_with_stats` (post-scan) observe the
+    /// flag and short-circuit their work for that filter. Mandatory
+    /// filters are never flagged because doing so would change the result
+    /// set.
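+    ///
+    /// Illustrative call with made-up numbers (this is how both filter
+    /// paths report a batch):
+    ///
+    /// ```rust,ignore
+    /// // 8192-row batch, 120 rows passed, 2 ms of evaluation,
+    /// // ~6 MiB of other-projection bytes judged skippable:
+    /// tracker.update(filter_id, 120, 8192, 2_000_000, 6 << 20);
+    /// ```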
+    #[doc(hidden)]
+    pub fn update(
+        &self,
+        id: FilterId,
+        matched: u64,
+        total: u64,
+        eval_nanos: u64,
+        batch_bytes: u64,
+    ) {
+        let stats_map = self.filter_stats.read();
+        let Some(entry) = stats_map.get(&id) else {
+            return;
+        };
+        let mut stats = entry.lock();
+        stats.update(matched, total, eval_nanos, batch_bytes);
+
+        // Fast path for non-optional filters: nothing else to do. The
+        // skip-flag mid-stream drop only applies to
+        // `OptionalFilterPhysicalExpr`-wrapped filters (hash-join /
+        // TopK dynamic), and `is_optional` is cached inline on
+        // `SelectivityStats` at filter registration so this is a single
+        // field load on the already-held lock — no extra HashMap or
+        // `RwLock::read()` per batch.
+        if !stats.is_optional {
+            return;
+        }
+
+        // Optional filter: do the skip-flag check every batch — there is
+        // deliberately no batch-count gate here. We want join/TopK skip
+        // flags to fire as soon as stats prove the filter's selectivity
+        // has collapsed, even mid-row-group. The CI-bound calc is cheap
+        // arithmetic on already-locked stats.
+        if !self.config.min_bytes_per_sec.is_finite() {
+            return;
+        }
+        let Some(ub) = stats.confidence_upper_bound(self.config.confidence_z) else {
+            return;
+        };
+        if ub >= self.config.min_bytes_per_sec {
+            return;
+        }
+        drop(stats);
+        drop(stats_map);
+
+        if let Some(flag) = self.skip_flags.read().get(&id)
+            && !flag.swap(true, Ordering::Release)
+        {
+            debug!(
+                "FilterId {id}: mid-stream skip — CI upper bound {ub} < {} bytes/sec",
+                self.config.min_bytes_per_sec
+            );
+        }
+    }
+
+    /// Returns the shared skip flag for `id`, creating one if absent.
+    ///
+    /// Cloned into [`crate::row_filter::DatafusionArrowPredicate`] so the
+    /// row-filter path can short-circuit when the per-batch update path
+    /// decides the filter has stopped pulling its weight. The post-scan
+    /// path uses [`Self::is_filter_skipped`] instead — it does not need a
+    /// long-lived handle.
+    pub(crate) fn skip_flag(&self, id: FilterId) -> Arc<AtomicBool> {
+        if let Some(existing) = self.skip_flags.read().get(&id) {
+            return Arc::clone(existing);
+        }
+        let mut write = self.skip_flags.write();
+        Arc::clone(
+            write
+                .entry(id)
+                .or_insert_with(|| Arc::new(AtomicBool::new(false))),
+        )
+    }
+
+    /// Returns `true` when `id` has been mid-stream-dropped by the tracker.
+    ///
+    /// Cheap: a single `RwLock::read` plus an atomic load. Called from the
+    /// post-scan filter loop in `apply_post_scan_filters_with_stats`.
+    pub(crate) fn is_filter_skipped(&self, id: FilterId) -> bool {
+        self.skip_flags
+            .read()
+            .get(&id)
+            .is_some_and(|f| f.load(Ordering::Acquire))
+    }
+
+    /// Partition filters into row-level predicates vs post-scan filters.
+    ///
+    /// Called once per file open (cold path).
+    ///
+    /// **Locking — two phases:**
+    /// 1. Acquires `inner` (exclusive) and `filter_stats` (shared read) for
+    ///    all decision logic — promotion, demotion, initial placement, and
+    ///    sorting by effectiveness. Because `filter_stats` is only
+    ///    read-locked, concurrent `update()` calls proceed unblocked.
+    /// 2. If new filter IDs were seen, briefly acquires `filter_stats` (write)
+    ///    to insert per-filter `Mutex` entries so that future `update()` calls
+    ///    can find them.
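+    ///
+    /// Typical per-file-open usage (simplified from `ParquetOpener`):
+    ///
+    /// ```rust,ignore
+    /// let partitioned = tracker.partition_filters(
+    ///     conjuncts,              // Vec<(FilterId, Arc<dyn PhysicalExpr>)>
+    ///     &projection_columns,    // leaf indices in the user projection
+    ///     projection_bytes,       // compressed bytes of the projection
+    ///     metadata,               // &ParquetMetaData
+    /// );
+    /// // partitioned.row_filters → build_row_filter(...)
+    /// // partitioned.post_scan   → apply_post_scan_filters_with_stats(...)
+    /// ```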
+    #[doc(hidden)]
+    pub fn partition_filters(
+        &self,
+        filters: Vec<(FilterId, Arc<dyn PhysicalExpr>)>,
+        projection_columns: &std::collections::HashSet<usize>,
+        projection_scan_size: usize,
+        metadata: &ParquetMetaData,
+    ) -> PartitionedFilters {
+        // Phase 1: inner.lock() + filter_stats.read() → all decision logic
+        let mut guard = self.inner.lock();
+        let stats_map = self.filter_stats.read();
+        let result = guard.partition_filters(
+            filters,
+            projection_columns,
+            projection_scan_size,
+            metadata,
+            &self.config,
+            &stats_map,
+        );
+        drop(stats_map);
+        drop(guard);
+
+        // Phase 2: if new filters were seen, briefly acquire write locks
+        // to insert per-filter `Mutex<SelectivityStats>` (with
+        // `is_optional` cached inline so the per-batch `update()` hot
+        // path can fast-return for mandatory filters) and an
+        // `AtomicBool` skip-flag (only consulted for optional filters).
+        if !result.new_optional_flags.is_empty() {
+            let mut stats_write = self.filter_stats.write();
+            let mut skip_write = self.skip_flags.write();
+            for (id, is_optional) in result.new_optional_flags {
+                stats_write.entry(id).or_insert_with(|| {
+                    Mutex::new(SelectivityStats {
+                        is_optional,
+                        ..Default::default()
+                    })
+                });
+                skip_write
+                    .entry(id)
+                    .or_insert_with(|| Arc::new(AtomicBool::new(false)));
+            }
+        }
+
+        result.partitioned
+    }
+
+    /// Test helper: ensure a stats entry exists for the given filter ID.
+    /// In production, `partition_filters()` inserts entries for new filters.
+    /// Tests that call `update()` without prior `partition_filters()` need this.
+    #[cfg(test)]
+    fn ensure_stats_entry(&self, id: FilterId) {
+        let map = self.filter_stats.read();
+        if map.get(&id).is_none() {
+            drop(map);
+            self.filter_stats
+                .write()
+                .entry(id)
+                .or_insert_with(|| Mutex::new(SelectivityStats::default()));
+        }
+    }
+}
+
+/// Internal result from [`SelectivityTrackerInner::partition_filters`].
+///
+/// Carries both the partitioned filters and the `(FilterId, is_optional)`
+/// entries seen for the first time, so the outer
+/// [`SelectivityTracker::partition_filters`] can insert per-filter
+/// `Mutex<SelectivityStats>` entries (with `is_optional` cached inline)
+/// in a brief Phase 2 write lock.
+struct PartitionResult {
+    partitioned: PartitionedFilters,
+    /// `(FilterId, is_optional)` entries observed for the first time in
+    /// this `partition_filters` call.
+    new_optional_flags: Vec<(FilterId, bool)>,
+}
+
+/// Filter state-machine and generation tracking, guarded by the `Mutex`
+/// inside [`SelectivityTracker`].
+///
+/// This struct intentionally does **not** contain per-filter stats — those
+/// live in the separate `filter_stats` lock so that the hot `update()` path
+/// can modify stats without acquiring this lock. Only the cold
+/// `partition_filters()` path (once per file open) needs this lock.
+#[derive(Debug)]
+struct SelectivityTrackerInner {
+    /// Per-filter lifecycle state (RowFilter / PostScan / Dropped).
+    filter_states: HashMap<FilterId, FilterState>,
+    /// Last-seen snapshot generation per filter, for detecting when a dynamic
+    /// filter's selectivity has changed (e.g. hash-join build side grew).
+    snapshot_generations: HashMap<FilterId, u64>,
+}
+
+impl SelectivityTrackerInner {
+    fn new() -> Self {
+        Self {
+            filter_states: HashMap::new(),
+            snapshot_generations: HashMap::new(),
+        }
+    }
+
+    /// Check and update the snapshot generation for a filter.
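+    ///
+    /// `generation == 0` marks an expression that is not a dynamic filter
+    /// and is ignored. A changed generation resets the filter's stats; a
+    /// `Dropped` filter is revived to `PostScan` so the re-armed filter
+    /// gets another chance under its new selectivity (see the body below).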
+    fn note_generation(
+        &mut self,
+        id: FilterId,
+        generation: u64,
+        stats_map: &HashMap<FilterId, Mutex<SelectivityStats>>,
+    ) {
+        if generation == 0 {
+            return;
+        }
+        match self.snapshot_generations.get(&id) {
+            Some(&prev_generation) if prev_generation == generation => {}
+            Some(_) => {
+                let current_state = self.filter_states.get(&id).copied();
+                // Always reset stats since selectivity changed with new generation.
+                if let Some(entry) = stats_map.get(&id) {
+                    *entry.lock() = SelectivityStats::default();
+                }
+                self.snapshot_generations.insert(id, generation);
+
+                // Optional/dynamic filters only get more selective over time
+                // (hash join build side accumulates more values). So if the
+                // filter was already working (RowFilter or PostScan), preserve
+                // its state. Only un-drop Dropped filters back to PostScan
+                // so they get another chance with the new selectivity.
+                if current_state == Some(FilterState::Dropped) {
+                    debug!("FilterId {id} generation changed, un-dropping to PostScan");
+                    self.filter_states.insert(id, FilterState::PostScan);
+                } else {
+                    debug!(
+                        "FilterId {id} generation changed, resetting stats but preserving state {current_state:?}"
+                    );
+                }
+            }
+            None => {
+                self.snapshot_generations.insert(id, generation);
+            }
+        }
+    }
+
+    /// Get the effectiveness for a filter by ID.
+    fn get_effectiveness_by_id(
+        &self,
+        id: FilterId,
+        stats_map: &HashMap<FilterId, Mutex<SelectivityStats>>,
+    ) -> Option<f64> {
+        stats_map
+            .get(&id)
+            .and_then(|entry| entry.lock().effectiveness())
+    }
+
+    /// Demote a filter to post-scan, or drop it entirely if optional.
+    fn demote_or_drop(
+        &mut self,
+        id: FilterId,
+        expr: &Arc<dyn PhysicalExpr>,
+        post_scan: &mut Vec<(FilterId, Arc<dyn PhysicalExpr>)>,
+        stats_map: &HashMap<FilterId, Mutex<SelectivityStats>>,
+    ) {
+        if expr.downcast_ref::<OptionalFilterPhysicalExpr>().is_none() {
+            self.filter_states.insert(id, FilterState::PostScan);
+            post_scan.push((id, Arc::clone(expr)));
+            // Reset stats for this filter so it can be re-evaluated as a post-scan filter.
+            if let Some(entry) = stats_map.get(&id) {
+                *entry.lock() = SelectivityStats::default();
+            }
+        } else {
+            self.filter_states.insert(id, FilterState::Dropped);
+        }
+    }
+
+    /// Promote a filter to row-level.
+    fn promote(
+        &mut self,
+        id: FilterId,
+        expr: Arc<dyn PhysicalExpr>,
+        row_filters: &mut Vec<(FilterId, Arc<dyn PhysicalExpr>)>,
+        stats_map: &HashMap<FilterId, Mutex<SelectivityStats>>,
+    ) {
+        row_filters.push((id, expr));
+        self.filter_states.insert(id, FilterState::RowFilter);
+        // Reset stats for this filter since it will be evaluated at row-level now.
+        if let Some(entry) = stats_map.get(&id) {
+            *entry.lock() = SelectivityStats::default();
+        }
+    }
+
+    /// Partition filters into row-level and post-scan buckets.
+    fn partition_filters(
+        &mut self,
+        filters: Vec<(FilterId, Arc<dyn PhysicalExpr>)>,
+        projection_columns: &std::collections::HashSet<usize>,
+        projection_scan_size: usize,
+        metadata: &ParquetMetaData,
+        config: &TrackerConfig,
+        stats_map: &HashMap<FilterId, Mutex<SelectivityStats>>,
+    ) -> PartitionResult {
+        let mut new_optional_flags: Vec<(FilterId, bool)> = Vec::new();
+
+        // If min_bytes_per_sec is INFINITY -> all filters are post-scan.
+        if config.min_bytes_per_sec.is_infinite() {
+            debug!(
+                "Filter promotion disabled via min_bytes_per_sec=INFINITY; all {} filters post-scan",
+                filters.len()
+            );
+            // Register all filter IDs so update() can find them
+            for (id, expr) in &filters {
+                if !stats_map.contains_key(id) {
+                    new_optional_flags.push((*id, is_optional_filter(expr)));
+                }
+            }
+            return PartitionResult {
+                partitioned: PartitionedFilters {
+                    row_filters: Vec::new(),
+                    post_scan: filters,
+                },
+                new_optional_flags,
+            };
+        }
+        // If min_bytes_per_sec is 0 -> all filters are promoted.
+        if config.min_bytes_per_sec == 0.0 {
+            debug!(
+                "All filters promoted via min_bytes_per_sec=0; all {} filters row-level",
+                filters.len()
+            );
+            // Register all filter IDs so update() can find them
+            for (id, expr) in &filters {
+                if !stats_map.contains_key(id) {
+                    new_optional_flags.push((*id, is_optional_filter(expr)));
+                }
+            }
+            return PartitionResult {
+                partitioned: PartitionedFilters {
+                    row_filters: filters,
+                    post_scan: Vec::new(),
+                },
+                new_optional_flags,
+            };
+        }
+
+        // Note snapshot generations for dynamic filter detection.
+        // This clears stats for any filter whose generation has changed since the last scan.
+        // This must be done before any other logic since it can change filter states and stats.
+        for &(id, ref expr) in &filters {
+            let generation = snapshot_generation(expr);
+            self.note_generation(id, generation, stats_map);
+        }
+
+        // Separate into row filters and post-scan filters based on effectiveness and state.
+        let mut row_filters: Vec<(FilterId, Arc<dyn PhysicalExpr>)> = Vec::new();
+        let mut post_scan_filters: Vec<(FilterId, Arc<dyn PhysicalExpr>)> = Vec::new();
+
+        let confidence_z = config.confidence_z;
+        for (id, expr) in filters {
+            let state = self.filter_states.get(&id).copied();
+
+            let Some(state) = state else {
+                // New filter: decide initial placement.
+                //
+                // We start at row-level only when the filter pulls in a
+                // small amount of *extra* I/O — bytes for filter columns
+                // **not already in the user projection** — relative to the
+                // projection. These are the cases where the row-level
+                // I/O cost is bounded and late materialization on a
+                // selective filter is a clear win (think a small int
+                // column predicate against a heavy string projection).
+                //
+                // Two cases default to post-scan instead, with the
+                // tracker free to promote later if measured
+                // bytes-saved-per-sec exceeds `min_bytes_per_sec`:
+                //
+                // - `extra_bytes == 0`: filter cols are entirely in the
+                //   projection (e.g. `WHERE col <> '' GROUP BY col`).
+                //   There's no I/O to save; the only payoff is late
+                //   materialization on the *non*-filter projection
+                //   columns, which depends on selectivity we don't know
+                //   yet. Empirically (ClickBench Q10/11/13/14/26)
+                //   defaulting these to row-level loses to post-scan
+                //   because predicate-cache eviction on heavy string
+                //   columns means the filter column is decoded twice.
+                //
+                // - `byte_ratio > byte_ratio_threshold`: extra I/O is
+                //   too high to justify before we have evidence the
+                //   filter is selective.
+                //
+                // Pre-existing snapshot-generation handling
+                // ([`SelectivityTrackerInner::note_generation`]) keeps
+                // dynamic filters (hash-join, TopK) at post-scan when
+                // they re-arm with new values — those rely on row-group
+                // statistics pruning rather than row-level I/O savings,
+                // so post-scan is correct for them too.
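+                //
+                // Worked example with made-up sizes: the projection reads
+                // 100 MiB and the filter touches one extra 5 MiB column,
+                // so byte_ratio = 0.05 <= 0.20 and the filter starts at
+                // row level. If the filter column is already in the
+                // projection, extra_bytes == 0 and it starts at post-scan.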
+                let filter_columns: Vec<usize> = collect_columns(&expr)
+                    .iter()
+                    .map(|col| col.index())
+                    .collect();
+                let extra_columns: Vec<usize> = filter_columns
+                    .iter()
+                    .copied()
+                    .filter(|c| !projection_columns.contains(c))
+                    .collect();
+                let extra_bytes =
+                    crate::row_filter::total_compressed_bytes(&extra_columns, metadata);
+                let byte_ratio = if projection_scan_size > 0 {
+                    extra_bytes as f64 / projection_scan_size as f64
+                } else {
+                    1.0
+                };
+
+                if !stats_map.contains_key(&id) {
+                    new_optional_flags.push((id, is_optional_filter(&expr)));
+                }
+
+                let row_level =
+                    extra_bytes > 0 && byte_ratio <= config.byte_ratio_threshold;
+                if row_level {
+                    debug!(
+                        "FilterId {id}: New filter → Row filter (byte_ratio {byte_ratio:.4} <= {}, extra_bytes={extra_bytes}) — {expr}",
+                        config.byte_ratio_threshold
+                    );
+                    self.filter_states.insert(id, FilterState::RowFilter);
+                    row_filters.push((id, expr));
+                } else {
+                    debug!(
+                        "FilterId {id}: New filter → Post-scan (byte_ratio {byte_ratio:.4}, extra_bytes={extra_bytes}) — {expr}"
+                    );
+                    self.filter_states.insert(id, FilterState::PostScan);
+                    post_scan_filters.push((id, expr));
+                }
+                continue;
+            };
+
+            match state {
+                FilterState::RowFilter => {
+                    // Should we demote this filter based on the CI upper bound?
+                    if let Some(entry) = stats_map.get(&id) {
+                        let stats = entry.lock();
+                        if let Some(ub) = stats.confidence_upper_bound(confidence_z)
+                            && ub < config.min_bytes_per_sec
+                        {
+                            drop(stats);
+                            debug!(
+                                "FilterId {id}: Row filter → Post-scan via CI upper bound {ub} < {} bytes/sec — {expr}",
+                                config.min_bytes_per_sec
+                            );
+                            self.demote_or_drop(
+                                id,
+                                &expr,
+                                &mut post_scan_filters,
+                                stats_map,
+                            );
+                            continue;
+                        }
+                    }
+                    // If not demoted, keep as a row filter.
+                    row_filters.push((id, expr));
+                }
+                FilterState::PostScan => {
+                    // Single gate: scatter-aware CI lower bound on
+                    // bytes-saved-per-sec ≥ `min_bytes_per_sec`.
+                    //
+                    // The metric (see [`SelectivityStats::update`])
+                    // counts only sub-batch windows the filter empties
+                    // out, so a 50% uniform filter scores ~0 and stays
+                    // at post-scan; a TopK / hash-join / `Title LIKE`
+                    // style filter where most batches drop entirely
+                    // blows past the threshold.
+                    //
+                    // Earlier revisions also required `prune_rate ≥ 99%`
+                    // on the theory that arrow-rs's row-level path
+                    // double-decoded heavy string columns when the
+                    // filter and projection overlapped. EXPLAIN ANALYZE
+                    // on the ClickBench Q23 workload (URL LIKE
+                    // `%google%`) showed the predicate cache is in fact
+                    // active (`predicate_cache_inner_records=8.76M`)
+                    // and the filter column is decoded once. The gate
+                    // was removed; the residual ClickBench regressions
+                    // we attributed to it (Q26 / Q31) trace to a
+                    // different cause: post-scan filtering inside the
+                    // opener changes batch-arrival order at downstream
+                    // TopK, shifting the convergence point of TopK's
+                    // dynamic filter and slightly weakening file-stats
+                    // pruning. That has nothing to do with the
+                    // promotion decision.
+                    if let Some(entry) = stats_map.get(&id) {
+                        let stats = entry.lock();
+                        if let Some(lb) = stats.confidence_lower_bound(confidence_z)
+                            && lb >= config.min_bytes_per_sec
+                        {
+                            drop(stats);
+                            debug!(
+                                "FilterId {id}: Post-scan → Row filter via CI lower bound {lb} >= {} bytes/sec — {expr}",
+                                config.min_bytes_per_sec
+                            );
+                            self.promote(id, expr, &mut row_filters, stats_map);
+                            continue;
+                        }
+                    }
+                    // Should we drop this filter if it is optional and ineffective?
+                    // Non-optional filters must stay as post-scan regardless.
+                    if let Some(entry) = stats_map.get(&id) {
+                        let stats = entry.lock();
+                        if let Some(ub) = stats.confidence_upper_bound(confidence_z)
+                            && ub < config.min_bytes_per_sec
+                            && expr.downcast_ref::<OptionalFilterPhysicalExpr>().is_some()
+                        {
+                            drop(stats);
+                            debug!(
+                                "FilterId {id}: Post-scan → Dropped via CI upper bound {ub} < {} bytes/sec — {expr}",
+                                config.min_bytes_per_sec
+                            );
+                            self.filter_states.insert(id, FilterState::Dropped);
+                            continue;
+                        }
+                    }
+                    // Keep as a post-scan filter (don't reset stats for mandatory filters).
+                    post_scan_filters.push((id, expr));
+                }
+                FilterState::Dropped => continue,
+            }
+        }
+
+        // Sort row filters by:
+        // - Effectiveness (descending, higher = better) if available for both filters.
+        // - Scan size (ascending, cheapest first) as a fallback — cheap filters prune
+        //   rows before expensive ones, reducing downstream evaluation cost.
+        let cmp_row_filters =
+            |(id_a, expr_a): &(FilterId, Arc<dyn PhysicalExpr>),
+             (id_b, expr_b): &(FilterId, Arc<dyn PhysicalExpr>)| {
+                let eff_a = self.get_effectiveness_by_id(*id_a, stats_map);
+                let eff_b = self.get_effectiveness_by_id(*id_b, stats_map);
+                if let (Some(eff_a), Some(eff_b)) = (eff_a, eff_b) {
+                    eff_b
+                        .partial_cmp(&eff_a)
+                        .unwrap_or(std::cmp::Ordering::Equal)
+                } else {
+                    let size_a = filter_scan_size(expr_a, metadata);
+                    let size_b = filter_scan_size(expr_b, metadata);
+                    size_a.cmp(&size_b)
+                }
+            };
+        row_filters.sort_by(cmp_row_filters);
+        // Post-scan filters: same logic (cheaper post-scan filters first to reduce
+        // the batch size for subsequent filters).
+        post_scan_filters.sort_by(cmp_row_filters);
+
+        debug!(
+            "Partitioned filters: {} row-level, {} post-scan",
+            row_filters.len(),
+            post_scan_filters.len()
+        );
+        PartitionResult {
+            partitioned: PartitionedFilters {
+                row_filters,
+                post_scan: post_scan_filters,
+            },
+            new_optional_flags,
+        }
+    }
+}
+
+/// Returns `true` if `expr` is wrapped in [`OptionalFilterPhysicalExpr`].
+fn is_optional_filter(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    expr.downcast_ref::<OptionalFilterPhysicalExpr>().is_some()
+}
+
+/// Estimate the number of bytes needed to evaluate a filter, based on the
+/// columns it references, as if it were applied to the entire file.
+/// This is used for the initial placement of new filters before any stats are
+/// available, and as a fallback for filters without stats.
+fn filter_scan_size(expr: &Arc<dyn PhysicalExpr>, metadata: &ParquetMetaData) -> usize {
+    let columns: Vec<usize> = collect_columns(expr)
+        .iter()
+        .map(|col| col.index())
+        .collect();
+
+    crate::row_filter::total_compressed_bytes(&columns, metadata)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion_physical_expr::expressions::Column;
+    use parquet::basic::Type as PhysicalType;
+    use parquet::file::metadata::{ColumnChunkMetaData, FileMetaData, RowGroupMetaData};
+    use parquet::schema::types::SchemaDescPtr;
+    use parquet::schema::types::Type as SchemaType;
+    use std::sync::Arc;
+
+    mod helper_functions {
+        use super::*;
+
+        /// Creates test ParquetMetaData with specified row groups and column sizes.
+        ///
+        /// # Arguments
+        /// * `specs` - Vec of (num_rows, vec![compressed_size]) tuples for each row group
+        pub fn create_test_metadata(specs: Vec<(i64, Vec<usize>)>) -> ParquetMetaData {
+            // Get the maximum number of columns across all specs
+            let num_columns = specs
+                .iter()
+                .map(|(_, sizes)| sizes.len())
+                .max()
+                .unwrap_or(1);
+            let schema_descr = get_test_schema_descr_with_columns(num_columns);
+
+            let row_group_metadata: Vec<_> = specs
+                .into_iter()
+                .map(|(num_rows, column_sizes)| {
+                    let columns = column_sizes
+                        .into_iter()
+                        .enumerate()
+                        .map(|(col_idx, size)| {
+                            ColumnChunkMetaData::builder(schema_descr.column(col_idx))
+                                .set_num_values(num_rows)
+                                .set_total_compressed_size(size as i64)
+                                .build()
+                                .unwrap()
+                        })
+                        .collect();
+
+                    RowGroupMetaData::builder(schema_descr.clone())
+                        .set_num_rows(num_rows)
+                        .set_column_metadata(columns)
+                        .build()
+                        .unwrap()
+                })
+                .collect();
+
+            let total_rows: i64 = row_group_metadata.iter().map(|rg| rg.num_rows()).sum();
+            let file_metadata =
+                FileMetaData::new(1, total_rows, None, None, schema_descr.clone(), None);
+
+            ParquetMetaData::new(file_metadata, row_group_metadata)
+        }
+
+        /// Creates a simple column expression with the given name and index.
+        pub fn col_expr(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {
+            Arc::new(Column::new(name, index))
+        }
+
+        /// Create a schema with the specified number of columns, named "a", "b", etc.
+        pub fn get_test_schema_descr_with_columns(num_columns: usize) -> SchemaDescPtr {
+            use parquet::basic::LogicalType;
+
+            let fields: Vec<_> = (0..num_columns)
+                .map(|i| {
+                    let col_name = format!("{}", (b'a' + i as u8) as char);
+                    SchemaType::primitive_type_builder(
+                        &col_name,
+                        PhysicalType::BYTE_ARRAY,
+                    )
+                    .with_logical_type(Some(LogicalType::String))
+                    .build()
+                    .unwrap()
+                })
+                .map(Arc::new)
+                .collect();
+
+            let schema = SchemaType::group_type_builder("schema")
+                .with_fields(fields)
+                .build()
+                .unwrap();
+            Arc::new(parquet::schema::types::SchemaDescriptor::new(Arc::new(
+                schema,
+            )))
+        }
+    }
+
+    mod selectivity_stats_tests {
+        use super::*;
+
+        #[test]
+        fn test_effectiveness_basic_calculation() {
+            let mut stats = SelectivityStats::default();
+
+            // 100 rows total, 50 rows pruned (matched 50), 1 sec eval time, 10000 bytes seen
+            // bytes_per_row = 10000 / 100 = 100
+            // bytes_saved = 50 * 100 = 5000
+            // effectiveness = 5000 * 1e9 / 1e9 = 5000
+            stats.update(50, 100, 1_000_000_000, 10_000);
+
+            let eff = stats.effectiveness().unwrap();
+            assert!((eff - 5000.0).abs() < 0.1);
+        }
+
+        #[test]
+        fn test_effectiveness_zero_rows_total() {
+            let mut stats = SelectivityStats::default();
+            stats.update(0, 0, 1_000_000_000, 10_000);
+
+            assert_eq!(stats.effectiveness(), None);
+        }
+
+        #[test]
+        fn test_effectiveness_zero_eval_nanos() {
+            let mut stats = SelectivityStats::default();
+            stats.update(50, 100, 0, 10_000);
+
+            assert_eq!(stats.effectiveness(), None);
+        }
+
+        #[test]
+        fn test_effectiveness_zero_bytes_seen() {
+            let mut stats = SelectivityStats::default();
+            stats.update(50, 100, 1_000_000_000, 0);
+
+            assert_eq!(stats.effectiveness(), None);
+        }
+
+        #[test]
+        fn test_effectiveness_all_rows_matched() {
+            let mut stats = SelectivityStats::default();
+            // All rows matched (no pruning)
+            stats.update(100, 100, 1_000_000_000, 10_000);
+
+            let eff = stats.effectiveness().unwrap();
+            assert_eq!(eff, 0.0);
+        }
+
+        #[test]
+        fn test_confidence_bounds_single_sample() {
+            let mut stats = SelectivityStats::default();
+            stats.update(50, 100, 1_000_000_000, 10_000);
+
+            // A single sample returns None for confidence bounds
+            assert_eq!(stats.confidence_lower_bound(2.0), None);
+            assert_eq!(stats.confidence_upper_bound(2.0), None);
+        }
+
+        #[test]
+        fn test_welford_identical_samples() {
+            let mut stats = SelectivityStats::default();
+
+            // Add two identical samples
+            stats.update(50, 100, 1_000_000_000, 10_000);
+            stats.update(50, 100, 1_000_000_000, 10_000);
+
+            // Variance should be 0
+            assert_eq!(stats.sample_count, 2);
+            let lb = stats.confidence_lower_bound(2.0).unwrap();
+            let ub = stats.confidence_upper_bound(2.0).unwrap();
+
+            // Both should equal the mean since the variance is 0
+            assert!((lb - ub).abs() < 0.01);
+        }
+
+        #[test]
+        fn test_welford_variance_calculation() {
+            let mut stats = SelectivityStats::default();
+
+            // Add samples constructed to produce effectiveness values of
+            // 5000, 6000, and 7000
+            stats.update(50, 100, 1_000_000_000, 10_000); // eff ≈ 5000
+            stats.update(40, 100, 1_000_000_000, 10_000); // eff ≈ 6000
+            stats.update(30, 100, 1_000_000_000, 10_000); // eff ≈ 7000
+
+            // We should have 3 samples
+            assert_eq!(stats.sample_count, 3);
+
+            // Mean should be 6000
+            assert!((stats.eff_mean - 6000.0).abs() < 1.0);
+
+            // Both bounds should be defined
+            let lb = stats.confidence_lower_bound(1.0).unwrap();
+            let ub = stats.confidence_upper_bound(1.0).unwrap();
+
+            assert!(lb < stats.eff_mean);
+            assert!(ub > stats.eff_mean);
+        }
+
+        #[test]
+        fn test_confidence_bounds_symmetry() {
+            let mut stats = SelectivityStats::default();
+
+            stats.update(50, 100, 1_000_000_000, 10_000);
+            stats.update(40, 100, 1_000_000_000, 10_000);
+
+            let lb = stats.confidence_lower_bound(2.0).unwrap();
+            let ub = stats.confidence_upper_bound(2.0).unwrap();
+
+            // Bounds should be symmetric around the mean
+            let lower_dist = stats.eff_mean - lb;
+            let upper_dist = ub - stats.eff_mean;
+
+            assert!((lower_dist - upper_dist).abs() < 0.01);
+        }
+
+        #[test]
+        fn test_welford_incremental_vs_batch() {
+            // Create two identical stats objects
+            let mut stats_incremental = SelectivityStats::default();
+            let mut stats_batch = SelectivityStats::default();
+
+            // Incremental: add one sample at a time
+            stats_incremental.update(50, 100, 1_000_000_000, 10_000);
+            stats_incremental.update(40, 100, 1_000_000_000, 10_000);
+            stats_incremental.update(30, 100, 1_000_000_000, 10_000);
+
+            // Batch: simulate a batch update (all at once)
+            stats_batch.update(120, 300, 3_000_000_000, 30_000);
+
+            // Both should produce the same overall row counts
+            assert_eq!(stats_incremental.rows_total, stats_batch.rows_total);
+            assert_eq!(stats_incremental.rows_matched, stats_batch.rows_matched);
+
+            // Means should be close
+            assert!((stats_incremental.eff_mean - stats_batch.eff_mean).abs() < 100.0);
+        }
+
+        #[test]
+        fn test_effectiveness_numerical_stability() {
+            let mut stats = SelectivityStats::default();
+
+            // Test with large values to ensure numerical stability
+            stats.update(
+                500_000_000,
+                1_000_000_000,
+                10_000_000_000_000,
+                1_000_000_000_000,
+            );
+
+            let eff = stats.effectiveness();
+            assert!(eff.is_some());
+            assert!(eff.unwrap() > 0.0);
+            assert!(!eff.unwrap().is_nan());
+            assert!(!eff.unwrap().is_infinite());
+        }
+    }
+
+    mod tracker_config_tests {
+        use super::*;
+
+        #[test]
+        fn test_default_config() {
+            let config = TrackerConfig::default();
+
+            assert!(config.min_bytes_per_sec.is_infinite());
+            assert_eq!(config.byte_ratio_threshold, 0.20);
+            assert_eq!(config.confidence_z, 2.0);
+        }
+
+        #[test]
+        fn test_with_min_bytes_per_sec() {
+            let config = TrackerConfig::new().with_min_bytes_per_sec(1000.0);
+
+            assert_eq!(config.min_bytes_per_sec, 1000.0);
+        }
+
+        #[test]
+        fn test_with_byte_ratio_threshold() {
+            let config = TrackerConfig::new().with_byte_ratio_threshold(0.5);
+
+            assert_eq!(config.byte_ratio_threshold, 0.5);
+        }
+
+        #[test]
+        fn test_with_confidence_z() {
+            let config = TrackerConfig::new().with_confidence_z(3.0);
+
+            assert_eq!(config.confidence_z, 3.0);
+        }
+
+        #[test]
+        fn test_builder_chain() {
+            let config = TrackerConfig::new()
+                .with_min_bytes_per_sec(500.0)
+                .with_byte_ratio_threshold(0.3)
+                .with_confidence_z(1.5);
+
+            assert_eq!(config.min_bytes_per_sec, 500.0);
+            assert_eq!(config.byte_ratio_threshold, 0.3);
+            assert_eq!(config.confidence_z, 1.5);
+        }
+
+        #[test]
+        fn test_build_creates_tracker() {
+            let tracker = TrackerConfig::new().with_min_bytes_per_sec(1000.0).build();
+
+            // The tracker should be created and functional
+            assert_eq!(tracker.config.min_bytes_per_sec, 1000.0);
+        }
+    }
+
+    mod state_machine_tests {
+        use super::helper_functions::*;
+        use super::*;
+
+        #[test]
+        fn test_initial_placement_high_byte_ratio() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1000.0)
+                .with_byte_ratio_threshold(0.2)
+                .build();
+
+            // Create metadata: 1 row group, 100 rows, 1000 bytes for the column
+            let metadata = create_test_metadata(vec![(100, vec![1000])]);
+
+            // Filter using column 0 (1000 bytes out of a 1000-byte projection
+            // = 100% ratio > 0.2), so this should be placed in post-scan initially
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr)];
+
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // With a 100% byte ratio, the filter should go to post-scan
+            assert_eq!(result.row_filters.len(), 0);
+            assert_eq!(result.post_scan.len(), 1);
+        }
+
+        #[test]
+        fn test_initial_placement_filter_in_projection_low_ratio() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1000.0)
+                .with_byte_ratio_threshold(0.5)
+                .build();
+
+            // Create metadata: 1 row group, 100 rows, 100 bytes for the column
+            let metadata = create_test_metadata(vec![(100, vec![100])]);
+
+            // Filter using column 0, which IS in the projection:
+            // extra_bytes == 0, so despite the low ratio there is no extra
+            // I/O to save and the filter starts as post-scan.
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr)];
+            let projection: std::collections::HashSet<usize> =
+                std::iter::once(0).collect();
+
+            let result = tracker.partition_filters(filters, &projection, 1000, &metadata);
+
+            assert_eq!(result.row_filters.len(), 0);
+            assert_eq!(result.post_scan.len(), 1);
+        }
+
+        #[test]
+        fn test_initial_placement_low_byte_ratio() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1000.0)
+                .with_byte_ratio_threshold(0.5)
+                .build();
+
+            // Create metadata: 1 row group, 100 rows, 100 bytes for the column
+            let metadata = create_test_metadata(vec![(100, vec![100])]);
+
+            // Filter using column 0 (100 bytes / 1000-byte projection = 10% ratio <= 0.5),
+            // so this should be placed in the row-filter bucket immediately
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr)];
+
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // With a 10% byte ratio, the filter should go to row-filter
+            assert_eq!(result.row_filters.len(), 1);
+            assert_eq!(result.post_scan.len(), 0);
+        }
+
+        #[test]
+        fn test_min_bytes_per_sec_infinity_disables_promotion() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(f64::INFINITY)
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![100])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr)];
+
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // All filters should go to post_scan when min_bytes_per_sec is INFINITY
+            assert_eq!(result.row_filters.len(), 0);
+            assert_eq!(result.post_scan.len(), 1);
+        }
+
+        #[test]
+        fn test_min_bytes_per_sec_zero_promotes_all() {
+            let tracker = TrackerConfig::new().with_min_bytes_per_sec(0.0).build();
+
+            let metadata = create_test_metadata(vec![(100, vec![1000])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr)];
+
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // All filters should be promoted to row_filters when min_bytes_per_sec is 0
+            assert_eq!(result.row_filters.len(), 1);
+            assert_eq!(result.post_scan.len(), 0);
+        }
+
+        #[test]
+        fn test_promotion_via_confidence_lower_bound() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1000.0)
+                .with_byte_ratio_threshold(0.5) // Force to PostScan initially
+                .with_confidence_z(0.5) // Lower z for easier promotion
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![1000])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr.clone())];
+
+            // First partition: goes to PostScan (high byte ratio)
+            let result = tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert_eq!(result.post_scan.len(), 1);
+            assert_eq!(result.row_filters.len(), 0);
+
+            // Feed high-effectiveness stats
+            for _ in 0..5 {
+                tracker.update(1, 1, 100, 100_000, 1000); // high effectiveness
+            }
+
+            // Second partition: the filter should be promoted to RowFilter
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert_eq!(result.row_filters.len(), 1);
+            assert_eq!(result.post_scan.len(), 0);
+        }
+
+        #[test]
+        fn test_demotion_via_confidence_upper_bound() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(10000.0)
+                .with_byte_ratio_threshold(0.1) // Force to RowFilter initially
+                .with_confidence_z(0.5) // Lower z for easier demotion
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![100])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr.clone())];
+
+            // First partition: goes to RowFilter (low byte ratio)
+            let result = tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert_eq!(result.row_filters.len(), 1);
+            assert_eq!(result.post_scan.len(), 0);
+
+            // Feed low-effectiveness stats
+            for _ in 0..5 {
+                tracker.update(1, 100, 100, 100_000, 1000); // all rows matched, no pruning
+            }
+
+            // Second partition: the filter should be demoted to PostScan
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert_eq!(result.row_filters.len(), 0);
+            assert_eq!(result.post_scan.len(), 1);
+        }
+
+        #[test]
+        fn test_demotion_resets_stats() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(10000.0)
+                .with_byte_ratio_threshold(0.1)
+                .with_confidence_z(0.5)
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![100])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr.clone())];
+
+            // Start as RowFilter
+            tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // Add stats
+            tracker.update(1, 100, 100, 100_000, 1000);
+            tracker.update(1, 100, 100, 100_000, 1000);
+
+            // Demote
+            tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // Stats should be zeroed after demotion
+            let stats_map = tracker.filter_stats.read();
+            assert_eq!(
+                *stats_map.get(&1).unwrap().lock(),
+                SelectivityStats::default()
+            );
+        }
+
+        #[test]
+        fn test_promotion_resets_stats() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(100.0)
+                .with_byte_ratio_threshold(0.5)
+                .with_confidence_z(0.5)
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![1000])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr.clone())];
+
+            // Start as PostScan
+            tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // Add high-effectiveness stats so the confidence lower bound
+            // clears `min_bytes_per_sec` and the promotion fires.
+            for _ in 0..3 {
+                tracker.update(1, 1, 100, 100_000, 1000);
+            }
+
+            // Promote
+            tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // Stats should be zeroed after promotion
+            let stats_map = tracker.filter_stats.read();
+            assert_eq!(
+                *stats_map.get(&1).unwrap().lock(),
+                SelectivityStats::default()
+            );
+        }
+
+        #[test]
+        fn test_non_optional_filter_not_dropped() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(10000.0)
+                .with_byte_ratio_threshold(0.5)
+                .with_confidence_z(0.5)
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![1000])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr.clone())];
+
+            // Start as PostScan
+            tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // Feed poor effectiveness stats
+            for _ in 0..5 {
+                tracker.update(1, 100, 100, 100_000, 1000); // no pruning
+            }
+
+            // Next partition: should stay as PostScan (not dropped, because
+            // the filter is not optional)
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert_eq!(result.post_scan.len(), 1);
+            assert_eq!(result.row_filters.len(), 0);
+        }
+
+        #[test]
+        fn test_persistent_dropped_state() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(10000.0)
+                .with_byte_ratio_threshold(0.5)
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![1000])]);
+            let expr = col_expr("a", 0);
+            let filters = vec![(1, expr.clone())];
+
+            // Mark the filter as dropped by manually setting its state
+            tracker
+                .inner
+                .lock()
+                .filter_states
+                .insert(1, FilterState::Dropped);
+
+            // On the next partition, dropped filters should not reappear
+            let result = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert_eq!(result.row_filters.len(), 0);
+            assert_eq!(result.post_scan.len(), 0);
+        }
+    }
+
+    mod filter_ordering_tests {
+        use super::helper_functions::*;
+        use super::*;
+
+        #[test]
+        fn test_filters_get_partitioned() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1.0) // Very low threshold
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![100, 100, 100])]);
+            let filters = vec![
+                (1, col_expr("a", 0)),
+                (2, col_expr("a", 1)),
+                (3, col_expr("a", 2)),
+            ];
+
+            // Partitioning should process all filters
+            let result = tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // With min_bytes_per_sec=1.0, filters should be partitioned
+            assert!(result.row_filters.len() + result.post_scan.len() > 0);
+
+            // Add stats and partition again
+            tracker.update(1, 60, 100, 1_000_000, 100);
+            tracker.update(2, 1, 100, 1_000_000, 100);
+            tracker.update(3, 40, 100, 1_000_000, 100);
+
+            let result2 = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // Filters should still be partitioned
+            assert!(result2.row_filters.len() + result2.post_scan.len() > 0);
+        }
+
+        #[test]
+        fn test_filters_processed_without_stats() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1.0) // Very low threshold
+                .build();
+
+            // Different column sizes: 300, 200, 100 bytes
+            let metadata = create_test_metadata(vec![(100, vec![300, 200, 100])]);
+            let filters = vec![
+                (1, col_expr("a", 0)),
+                (2, col_expr("a", 1)),
+                (3, col_expr("a", 2)),
+            ];
+
+            // First partition - no stats yet
+            let result = tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // All filters should be processed (partitioned into row/post-scan)
+            assert!(result.row_filters.len() + result.post_scan.len() > 0);
+
+            // Results should be consistent across repeated calls
+            let result2 = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert_eq!(
+                result.row_filters.len() + result.post_scan.len(),
+                result2.row_filters.len() + result2.post_scan.len()
+            );
+        }
+
+        #[test]
+        fn test_filters_with_partial_stats() {
+            let tracker = TrackerConfig::new().with_min_bytes_per_sec(1.0).build();
+
+            // Give filter 2 larger bytes so it's prioritized when falling back to byte ratio
+            let metadata = create_test_metadata(vec![(100, vec![100, 300, 100])]);
+            let filters = vec![
+                (1, col_expr("a", 0)),
+                (2, col_expr("a", 1)),
+                (3, col_expr("a", 2)),
+            ];
+
+            // First partition
+            let result1 = tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert!(result1.row_filters.len() + result1.post_scan.len() > 0);
+
+            // Only add stats for filters 1 and 3, not 2
+            tracker.update(1, 60, 100, 1_000_000, 100);
+            tracker.update(3, 60, 100, 1_000_000, 100);
+
+            // Second partition with partial stats
+            let result2 = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            assert!(result2.row_filters.len() + result2.post_scan.len() > 0);
+        }
+
+        #[test]
+        fn test_ordering_stability_with_identical_values() {
+            let tracker = TrackerConfig::new().with_min_bytes_per_sec(0.0).build();
+
+            let metadata = create_test_metadata(vec![(100, vec![100, 100, 100])]);
+            let filters = vec![
+                (1, col_expr("a", 0)),
+                (2, col_expr("a", 1)),
+                (3, col_expr("a", 2)),
+            ];
+
+            let result1 = tracker.partition_filters(
+                filters.clone(),
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            let result2 = tracker.partition_filters(
+                filters,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+
+            // Without stats and with identical byte sizes, the order should be stable
+            assert_eq!(result1.row_filters[0].0, result2.row_filters[0].0);
+            assert_eq!(result1.row_filters[1].0, result2.row_filters[1].0);
+            assert_eq!(result1.row_filters[2].0, result2.row_filters[2].0);
+        }
+    }
+
+    mod dynamic_filter_tests {
+        use super::helper_functions::*;
+        use super::*;
+
+        #[test]
+        fn test_generation_zero_ignored() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1000.0)
+                .with_byte_ratio_threshold(0.5)
+                .build();
+
+            let metadata = create_test_metadata(vec![(100, vec![1000])]);
+
+            // A static filter reports snapshot generation 0, which
+            // note_generation ignores: no generation should be recorded for it.
+            let expr1 = col_expr("a", 0);
+            let filters1 = vec![(1, expr1)];
+
+            tracker.partition_filters(
+                filters1,
+                &std::collections::HashSet::new(),
+                1000,
+                &metadata,
+            );
+            tracker.update(1, 50, 100, 100_000, 1000);
+
+            // Generation 0 doesn't trigger a state reset
+            let snapshot_gen = tracker.inner.lock().snapshot_generations.get(&1).copied();
+            assert_eq!(snapshot_gen, None);
+        }
+
+        #[test]
+        fn test_generation_change_clears_stats() {
+            let tracker = TrackerConfig::new()
+                .with_min_bytes_per_sec(1000.0)
+                .with_byte_ratio_threshold(0.5)
+                .build();
+
+            // Pre-populate the stats entry so update() can find it
+            tracker.ensure_stats_entry(1);
+
+            // Initialize the generation to 100
+            {
+                let mut inner = tracker.inner.lock();
+                let stats = tracker.filter_stats.read();
+                inner.note_generation(1, 100,
&stats); + } + + // Add stats + tracker.update(1, 50, 100, 100_000, 1000); + tracker.update(1, 50, 100, 100_000, 1000); + + let stats_before = { + let stats_map = tracker.filter_stats.read(); + *stats_map.get(&1).unwrap().lock() != SelectivityStats::default() + }; + assert!(stats_before); + + // Simulate generation change to a different value + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 101, &stats); + } + + // Stats should be zeroed on generation change + let stats_after = { + let stats_map = tracker.filter_stats.read(); + *stats_map.get(&1).unwrap().lock() == SelectivityStats::default() + }; + assert!(stats_after); + } + + #[test] + fn test_generation_unchanged_preserves_stats() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(1000.0).build(); + + // Pre-populate stats entry so update() can find it + tracker.ensure_stats_entry(1); + + // Manually set generation + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // Add stats + tracker.update(1, 50, 100, 100_000, 1000); + tracker.update(1, 50, 100, 100_000, 1000); + + let sample_count_before = { + let stats_map = tracker.filter_stats.read(); + stats_map.get(&1).map(|s| s.lock().sample_count) + }; + assert_eq!(sample_count_before, Some(2)); + + // Call note_generation with same generation + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // Stats should be preserved + let sample_count_after = { + let stats_map = tracker.filter_stats.read(); + stats_map.get(&1).map(|s| s.lock().sample_count) + }; + assert_eq!(sample_count_after, Some(2)); + } + + #[test] + fn test_generation_change_preserves_state() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.1) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + + // First partition: goes to RowFilter + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + + let state_before = tracker.inner.lock().filter_states.get(&1).copied(); + assert_eq!(state_before, Some(FilterState::RowFilter)); + + // Simulate generation change + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // State should be preserved despite stats being cleared + let state_after = tracker.inner.lock().filter_states.get(&1).copied(); + assert_eq!(state_after, Some(FilterState::RowFilter)); + } + + #[test] + fn test_generation_change_undrops_dropped_filter() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.1) + .build(); + + // Manually set filter state to Dropped + tracker + .inner + .lock() + .filter_states + .insert(1, FilterState::Dropped); + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // Simulate generation change + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 101, &stats); + } + + // Dropped filter should be un-dropped to PostScan + let state_after = tracker.inner.lock().filter_states.get(&1).copied(); + assert_eq!(state_after, Some(FilterState::PostScan)); + } + + #[test] + fn 
test_multiple_filters_independent_generation_tracking() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(1000.0).build(); + + // Pre-populate stats entries so update() can find them + tracker.ensure_stats_entry(1); + tracker.ensure_stats_entry(2); + + // Set generations for multiple filters + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + inner.note_generation(2, 200, &stats); + } + + // Add stats to both + tracker.update(1, 50, 100, 100_000, 1000); + tracker.update(2, 50, 100, 100_000, 1000); + + // Change generation of filter 1 only + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 101, &stats); + } + + // Filter 1 stats should be zeroed, filter 2 preserved + let stats_map = tracker.filter_stats.read(); + assert_eq!( + *stats_map.get(&1).unwrap().lock(), + SelectivityStats::default() + ); + assert_ne!( + *stats_map.get(&2).unwrap().lock(), + SelectivityStats::default() + ); + } + } + + mod integration_tests { + use super::helper_functions::*; + use super::*; + + #[test] + fn test_full_promotion_lifecycle() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(500.0) + .with_byte_ratio_threshold(0.5) // Force initial PostScan + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Step 1: Initial placement (PostScan) + let result = tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.post_scan.len(), 1); + assert_eq!(result.row_filters.len(), 0); + + // Step 2: Accumulate high effectiveness stats + for _ in 0..5 { + tracker.update(1, 1, 100, 100_000, 1000); // high effectiveness + } + + // Step 3: Promotion should occur + let result = tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + + // Step 4: Continue to partition without additional updates + let result = tracker.partition_filters( + filters, + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_full_demotion_lifecycle() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(10000.0) + .with_byte_ratio_threshold(0.1) // Force initial RowFilter + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Step 1: Initial placement (RowFilter) + let result = tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + + // Step 2: Accumulate low effectiveness stats + for _ in 0..5 { + tracker.update(1, 100, 100, 100_000, 1000); // no pruning + } + + // Step 3: Demotion should occur + let result = tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + + // Step 4: Continue to partition without additional updates + let result = tracker.partition_filters( + filters, + &std::collections::HashSet::new(), + 1000, + 
&metadata, + ); + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + } + + #[test] + fn test_multiple_filters_mixed_states() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.4) // Force PostScan initially (500/1000=0.5 > 0.4) + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![500, 500])]); + let filters = vec![(1, col_expr("a", 0)), (2, col_expr("a", 1))]; + + // Initial partition: both go to PostScan (500/1000 = 0.5 > 0.4) + let result = tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.post_scan.len(), 2); + + // Filter 1: high effectiveness (promote) + for _ in 0..3 { + tracker.update(1, 1, 100, 100_000, 500); + } + + // Filter 2: low effectiveness (stay PostScan) + for _ in 0..3 { + tracker.update(2, 100, 100, 100_000, 500); + } + + // Next partition: Filter 1 promoted, Filter 2 stays PostScan + let result = tracker.partition_filters( + filters, + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 1); + assert_eq!(result.row_filters[0].0, 1); + assert_eq!(result.post_scan[0].0, 2); + } + + #[test] + fn test_empty_filter_list() { + let tracker = TrackerConfig::new().build(); + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let filters = vec![]; + + let result = tracker.partition_filters( + filters, + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_single_filter() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(0.0).build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + + let result = tracker.partition_filters( + filters, + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_zero_effectiveness_stays_at_boundary() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(100.0) + .with_byte_ratio_threshold(0.1) + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Start as RowFilter + tracker.partition_filters( + filters.clone(), + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + + // All rows match (zero effectiveness) + for _ in 0..5 { + tracker.update(1, 100, 100, 100_000, 100); + } + + // Should demote due to CI upper bound being 0 + let result = tracker.partition_filters( + filters, + &std::collections::HashSet::new(), + 1000, + &metadata, + ); + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + } + + #[test] + fn test_confidence_z_parameter_stored() { + // Test that different confidence_z values are properly stored in config + let tracker_conservative = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .with_confidence_z(3.0) // Harder to promote + .build(); + + let tracker_aggressive = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .with_confidence_z(0.5) // Easier to promote + .build(); + + // Verify configs are stored correctly + 
assert_eq!(tracker_conservative.config.confidence_z, 3.0);
+            assert_eq!(tracker_aggressive.config.confidence_z, 0.5);
+
+            // The z-score scales the confidence intervals used in
+            // promotion/demotion decisions. With identical stats, a higher z
+            // widens the interval, pulling the lower bound down and making
+            // promotion harder; a lower z narrows it, making promotion
+            // easier. This is exercised by the other integration tests that
+            // verify actual promotion/demotion behavior.
+        }
+    }
+}
diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs
index a014c8b2726e7..ccdf5e664b8e9 100644
--- a/datafusion/datasource-parquet/src/source.rs
+++ b/datafusion/datasource-parquet/src/source.rs
@@ -39,8 +39,9 @@ use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_datasource::TableSchema;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfig;
+use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr::conjunction;
 use datafusion_physical_expr::projection::ProjectionExprs;
-use datafusion_physical_expr::{EquivalenceProperties, conjunction};
 use datafusion_physical_expr_adapter::DefaultPhysicalExprAdapterFactory;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use datafusion_physical_expr_common::physical_expr::fmt_sql;
@@ -277,8 +278,10 @@ pub struct ParquetSource {
     /// In particular, this is the schema of the table without partition columns,
     /// *not* the physical schema of the file.
     pub(crate) table_schema: TableSchema,
-    /// Optional predicate for row filtering during parquet scan
-    pub(crate) predicate: Option<Arc<dyn PhysicalExpr>>,
+    /// Optional predicate conjuncts for row filtering during parquet scan.
+    /// Each conjunct is tagged with a stable FilterId for selectivity tracking.
+    pub(crate) predicate_conjuncts:
+        Option<Vec<(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)>>,
     /// Optional user defined parquet file reader factory
     pub(crate) parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>,
     /// Batch size configuration
@@ -294,6 +297,10 @@ pub struct ParquetSource {
     /// so we still need to sort them after reading, so the reverse scan is inexact.
     /// Used to optimize ORDER BY ... DESC on sorted data.
     reverse_row_groups: bool,
+    /// Tracks filter selectivity across files for adaptive filter reordering.
+    /// Shared across all openers - each opener reads stats and makes its own
+    /// decision about which filters to push down vs. apply post-scan.
+    pub(crate) selectivity_tracker: Arc<crate::selectivity::SelectivityTracker>,
 }

 impl ParquetSource {
@@ -312,13 +319,16 @@ impl ParquetSource {
             table_schema,
             table_parquet_options: TableParquetOptions::default(),
             metrics: ExecutionPlanMetricsSet::new(),
-            predicate: None,
+            predicate_conjuncts: None,
             parquet_file_reader_factory: None,
             batch_size: None,
             metadata_size_hint: None,
             #[cfg(feature = "parquet_encryption")]
             encryption_factory: None,
             reverse_row_groups: false,
+            selectivity_tracker: Arc::new(
+                crate::selectivity::SelectivityTracker::default(),
+            ),
         }
     }

@@ -327,6 +337,15 @@ impl ParquetSource {
         mut self,
         table_parquet_options: TableParquetOptions,
     ) -> Self {
+        // Update the selectivity tracker from the config
+        let opts = &table_parquet_options.global;
+        self.selectivity_tracker = Arc::new(
+            crate::selectivity::TrackerConfig::new()
+                .with_min_bytes_per_sec(opts.filter_pushdown_min_bytes_per_sec)
+                .with_byte_ratio_threshold(opts.filter_collecting_byte_ratio_threshold)
+                .with_confidence_z(opts.filter_confidence_z)
+                .build(),
+        );
         self.table_parquet_options = table_parquet_options;
         self
     }
@@ -342,11 +361,23 @@ impl ParquetSource {
         self
     }

-    /// Set predicate information
+    /// Set predicate information.
+    ///
+    /// The predicate is split into conjuncts and each is assigned a stable
+    /// `FilterId` (its index in the conjunct list). These IDs are used for
+    /// selectivity tracking across files, avoiding ExprKey mismatch issues
+    /// when expressions are rebased or simplified per-file.
     #[expect(clippy::needless_pass_by_value)]
     pub fn with_predicate(&self, predicate: Arc<dyn PhysicalExpr>) -> Self {
+        use datafusion_physical_expr::split_conjunction;
         let mut conf = self.clone();
-        conf.predicate = Some(Arc::clone(&predicate));
+        let conjuncts: Vec<(crate::selectivity::FilterId, Arc<dyn PhysicalExpr>)> =
+            split_conjunction(&predicate)
+                .into_iter()
+                .enumerate()
+                .map(|(id, expr)| (id, Arc::clone(expr)))
+                .collect();
+        conf.predicate_conjuncts = Some(conjuncts);
         conf
     }

@@ -367,8 +398,15 @@ impl ParquetSource {
     /// Optional predicate.
     #[deprecated(since = "50.2.0", note = "use `filter` instead")]
-    pub fn predicate(&self) -> Option<&Arc<dyn PhysicalExpr>> {
-        self.predicate.as_ref()
+    pub fn predicate(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.combined_predicate()
+    }
+
+    /// Build a combined predicate from the conjuncts, if any.
+    fn combined_predicate(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.predicate_conjuncts
+            .as_ref()
+            .map(|conjuncts| conjunction(conjuncts.iter().map(|(_, e)| Arc::clone(e))))
     }

     /// return the optional file reader factory
@@ -399,19 +437,30 @@ impl ParquetSource {
         self.table_parquet_options.global.pushdown_filters
     }

-    /// If true, the `RowFilter` made by `pushdown_filters` may try to
-    /// minimize the cost of filter evaluation by reordering the
-    /// predicate [`Expr`]s. If false, the predicates are applied in
-    /// the same order as specified in the query. Defaults to false.
+    /// Set the legacy `reorder_filters` config flag.
     ///
-    /// [`Expr`]: datafusion_expr::Expr
+    /// The adaptive selectivity tracker subsumes static filter reordering
+    /// (filters are now ranked online by measured bytes-saved/sec), so the
+    /// flag is preserved for backwards compatibility but no longer has any
+    /// effect on filter placement.
+    #[deprecated(
+        since = "53.2.0",
+        note = "the adaptive selectivity tracker reorders filters by measured \
+                effectiveness; this flag is now a no-op"
+    )]
     pub fn with_reorder_filters(mut self, reorder_filters: bool) -> Self {
         self.table_parquet_options.global.reorder_filters = reorder_filters;
         self
     }

-    /// Return the value described in [`Self::with_reorder_filters`]
-    fn reorder_filters(&self) -> bool {
+    /// Return the value of the legacy `reorder_filters` config flag. Now a
+    /// no-op — see [`Self::with_reorder_filters`].
+    #[deprecated(
+        since = "53.2.0",
+        note = "the adaptive selectivity tracker reorders filters by measured \
+                effectiveness; this flag is now a no-op"
+    )]
+    pub fn reorder_filters(&self) -> bool {
         self.table_parquet_options.global.reorder_filters
     }

@@ -561,13 +610,13 @@ impl FileSource for ParquetSource {
                 .expect("Batch size must set before creating ParquetMorselizer"),
             limit: base_config.limit,
             preserve_order: base_config.preserve_order,
-            predicate: self.predicate.clone(),
+            predicate_conjuncts: self.predicate_conjuncts.clone(),
+            selectivity_tracker: Arc::clone(&self.selectivity_tracker),
             table_schema: self.table_schema.clone(),
            metadata_size_hint: self.metadata_size_hint,
             metrics: self.metrics().clone(),
             parquet_file_reader_factory,
             pushdown_filters: self.pushdown_filters(),
-            reorder_filters: self.reorder_filters(),
             force_filter_selections: self.force_filter_selections(),
             enable_page_index: self.enable_page_index(),
             enable_bloom_filter: self.bloom_filter_on_read(),
@@ -588,7 +637,7 @@ impl FileSource for ParquetSource {
     }

     fn filter(&self) -> Option<Arc<dyn PhysicalExpr>> {
-        self.predicate.clone()
+        self.combined_predicate()
     }

     fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
@@ -641,7 +690,7 @@ impl FileSource for ParquetSource {
         // the actual predicates are built in reference to the physical schema of
         // each file, which we do not have at this point and hence cannot use.
         // Instead, we use the logical schema of the file (the table schema without partition columns).
-        if let Some(predicate) = &self.predicate {
+        if let Some(predicate) = &self.combined_predicate() {
             let predicate_creation_errors = Count::new();
             if let Some(pruning_predicate) = build_pruning_predicates(
                 Some(predicate),
@@ -718,13 +767,16 @@ impl FileSource for ParquetSource {
                 PushedDown::No => None,
             })
             .collect_vec();
-        let predicate = match source.predicate {
-            Some(predicate) => {
-                conjunction(std::iter::once(predicate).chain(allowed_filters))
-            }
-            None => conjunction(allowed_filters),
-        };
-        source.predicate = Some(predicate);
+        // Merge existing conjuncts with new allowed filters
+        let mut all_conjuncts: Vec<Arc<dyn PhysicalExpr>> = source
+            .predicate_conjuncts
+            .as_ref()
+            .map(|c| c.iter().map(|(_, e)| Arc::clone(e)).collect())
+            .unwrap_or_default();
+        all_conjuncts.extend(allowed_filters);
+        // Re-assign FilterIds by index
+        source.predicate_conjuncts =
+            Some(all_conjuncts.into_iter().enumerate().collect());
         source = source.with_pushdown_filters(pushdown_filters);
         let source = Arc::new(source);
         // If pushdown_filters is false we tell our parents that they still have to handle the filters,
@@ -835,8 +887,10 @@ impl FileSource for ParquetSource {
     ) -> datafusion_common::Result<TreeNodeRecursion> {
         // Visit predicate (filter) expression if present
         let mut tnr = TreeNodeRecursion::Continue;
-        if let Some(predicate) = &self.predicate {
-            tnr = tnr.visit_sibling(|| f(predicate.as_ref()))?;
+        if let Some(ref conjuncts) = self.predicate_conjuncts {
+            for (_, expr) in conjuncts {
+                tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+            }
         }

         // Visit projection expressions
@@ -861,8 +915,9 @@ mod tests {
         let parquet_source =
             ParquetSource::new(Arc::new(Schema::empty())).with_predicate(predicate);

-        // same value. but filter() call Arc::clone internally
-        assert_eq!(parquet_source.predicate(), parquet_source.filter().as_ref());
+        // Both should return equivalent predicates
+        assert!(parquet_source.predicate().is_some());
+        assert!(parquet_source.filter().is_some());
     }

     #[test]
diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs
index a42a1560cb769..7b1c84a060590 100644
--- a/datafusion/physical-expr-common/src/physical_expr.rs
+++ b/datafusion/physical-expr-common/src/physical_expr.rs
@@ -687,6 +687,161 @@ pub fn is_volatile(expr: &Arc<dyn PhysicalExpr>) -> bool {
     is_volatile
 }

+/// A transparent wrapper that marks a [`PhysicalExpr`] as *optional* — i.e.,
+/// droppable without affecting query correctness.
+///
+/// This is used for filters that are performance hints (e.g., dynamic join
+/// filters) as opposed to mandatory predicates. The selectivity tracker can
+/// detect this wrapper via
+/// `expr.as_any().downcast_ref::<OptionalFilterPhysicalExpr>()`
+/// and choose to drop the filter entirely when it is not cost-effective.
+///
+/// All [`PhysicalExpr`] methods are delegated to the wrapped inner expression.
+///
+/// Currently used by `HashJoinExec` for dynamic join filters. When the
+/// selectivity tracker drops such a filter, the join still enforces
+/// correctness independently — "dropped" simply means the filter is never
+/// applied as a scan-time optimization.
+#[derive(Debug)]
+pub struct OptionalFilterPhysicalExpr {
+    inner: Arc<dyn PhysicalExpr>,
+}
+
+impl OptionalFilterPhysicalExpr {
+    /// Create a new optional filter wrapping the given expression.
+    pub fn new(inner: Arc<dyn PhysicalExpr>) -> Self {
+        Self { inner }
+    }
+
+    /// Returns a clone of the inner (unwrapped) expression.
+    pub fn inner(&self) -> Arc<dyn PhysicalExpr> {
+        Arc::clone(&self.inner)
+    }
+}
+
+impl Display for OptionalFilterPhysicalExpr {
+    /// Pass through to the inner expression. Surfacing the `Optional(..)`
+    /// wrapper in plan output would require updating dozens of sqllogictest
+    /// baselines for what is purely a runtime concept (the adaptive
+    /// scheduler's permission to drop this filter); plan readers don't need
+    /// to see it.
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self.inner)
+    }
+}
+
+impl PartialEq for OptionalFilterPhysicalExpr {
+    fn eq(&self, other: &Self) -> bool {
+        self.inner.as_ref() == other.inner.as_ref()
+    }
+}
+
+impl Eq for OptionalFilterPhysicalExpr {}
+
+impl Hash for OptionalFilterPhysicalExpr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.inner.as_ref().hash(state);
+    }
+}
+
+impl PhysicalExpr for OptionalFilterPhysicalExpr {
+    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
+        self.inner.data_type(input_schema)
+    }
+
+    fn nullable(&self, input_schema: &Schema) -> Result<bool> {
+        self.inner.nullable(input_schema)
+    }
+
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
+        self.inner.evaluate(batch)
+    }
+
+    fn return_field(&self, input_schema: &Schema) -> Result<FieldRef> {
+        self.inner.return_field(input_schema)
+    }
+
+    fn evaluate_selection(
+        &self,
+        batch: &RecordBatch,
+        selection: &BooleanArray,
+    ) -> Result<ColumnarValue> {
+        self.inner.evaluate_selection(batch, selection)
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        vec![&self.inner]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        assert_eq_or_internal_err!(
+            children.len(),
+            1,
+            "OptionalFilterPhysicalExpr: expected 1 child"
+        );
+        Ok(Arc::new(OptionalFilterPhysicalExpr::new(Arc::clone(
+            &children[0],
+        ))))
+    }
+
+    fn evaluate_bounds(&self, children: &[&Interval]) -> Result<Interval> {
+        self.inner.evaluate_bounds(children)
+    }
+
+    fn propagate_constraints(
+        &self,
+        interval: &Interval,
+        children: &[&Interval],
+    ) -> Result<Option<Vec<Interval>>> {
+        self.inner.propagate_constraints(interval, children)
+    }
+
+    fn evaluate_statistics(&self, children: &[&Distribution]) -> Result<Distribution> {
+        self.inner.evaluate_statistics(children)
+    }
+
+    fn propagate_statistics(
+        &self,
+        parent: &Distribution,
+        children: &[&Distribution],
+    ) -> Result<Option<Vec<Distribution>>> {
+        self.inner.propagate_statistics(parent, children)
+    }
+
+    fn get_properties(&self, children: &[ExprProperties]) -> Result<ExprProperties> {
+        self.inner.get_properties(children)
+    }
+
+    fn fmt_sql(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        self.inner.fmt_sql(f)
+    }
+
+    fn snapshot(&self) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+        // Always unwrap the Optional wrapper for snapshot consumers (e.g. PruningPredicate).
+        // If the inner expression has a snapshot, use it; otherwise return the inner directly.
+        Ok(Some(match self.inner.snapshot()? {
+            Some(snap) => snap,
+            None => Arc::clone(&self.inner),
+        }))
+    }
+
+    fn snapshot_generation(&self) -> u64 {
+        // The wrapper itself is not dynamic; tree-walking picks up the
+        // inner expression's generation via children().
+        0
+    }
+
+    fn is_volatile_node(&self) -> bool {
+        self.inner.is_volatile_node()
+    }
+
+    fn placement(&self) -> ExpressionPlacement {
+        self.inner.placement()
+    }
+}
+
 #[cfg(test)]
 mod test {
     use crate::physical_expr::PhysicalExpr;
@@ -694,6 +849,7 @@ mod test {
     use arrow::datatypes::{DataType, Schema};
     use datafusion_expr_common::columnar_value::ColumnarValue;
     use std::fmt::{Display, Formatter};
+    use std::hash::{Hash, Hasher};
     use std::sync::Arc;

     #[derive(Debug, PartialEq, Eq, Hash)]
@@ -868,4 +1024,104 @@ mod test {
             &BooleanArray::from(vec![true; 5]),
         );
     }
+
+    #[test]
+    fn test_optional_filter_downcast() {
+        use super::OptionalFilterPhysicalExpr;
+
+        let inner: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+        let optional = Arc::new(OptionalFilterPhysicalExpr::new(Arc::clone(&inner)));
+
+        // Can downcast to detect the wrapper
+        let as_physical: Arc<dyn PhysicalExpr> = optional;
+        assert!(
+            as_physical
+                .downcast_ref::<OptionalFilterPhysicalExpr>()
+                .is_some()
+        );
+
+        // The inner expr is NOT detectable as optional
+        assert!(inner.downcast_ref::<OptionalFilterPhysicalExpr>().is_none());
+    }
+
+    #[test]
+    fn test_optional_filter_delegates_evaluate() {
+        use super::OptionalFilterPhysicalExpr;
+
+        let inner: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+        let optional = OptionalFilterPhysicalExpr::new(Arc::clone(&inner));
+
+        let batch =
+            unsafe { RecordBatch::new_unchecked(Arc::new(Schema::empty()), vec![], 5) };
+        let result = optional.evaluate(&batch).unwrap();
+        let array = result.to_array(5).unwrap();
+        assert_eq!(array.len(), 5);
+    }
+
+    #[test]
+    fn test_optional_filter_children_and_with_new_children() {
+        use super::OptionalFilterPhysicalExpr;
+
+        let inner: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+        let optional = Arc::new(OptionalFilterPhysicalExpr::new(Arc::clone(&inner)));
+
+        // children() returns the inner expression
+        let children = optional.children();
+        assert_eq!(children.len(), 1);
+
+        // with_new_children preserves the wrapper
+        let new_inner: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+        let rewrapped = Arc::clone(&optional)
+            .with_new_children(vec![new_inner])
+            .unwrap();
+        assert!(
+            rewrapped
+                .downcast_ref::<OptionalFilterPhysicalExpr>()
+                .is_some()
+        );
+    }
+
+    #[test]
+    fn test_optional_filter_inner() {
+        use super::OptionalFilterPhysicalExpr;
+
+        let inner: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+        let optional = OptionalFilterPhysicalExpr::new(Arc::clone(&inner));
+
+        // inner() returns a clone of the wrapped expression
+        let unwrapped = optional.inner();
+        assert!(unwrapped.downcast_ref::<TestExpr>().is_some());
+    }
+
+    #[test]
+    fn test_optional_filter_snapshot_generation_zero() {
+        use super::OptionalFilterPhysicalExpr;
+
+        let inner: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+        let optional = OptionalFilterPhysicalExpr::new(inner);
+
+        assert_eq!(optional.snapshot_generation(), 0);
+    }
+
+    #[test]
+    fn test_optional_filter_eq_hash() {
+        use super::OptionalFilterPhysicalExpr;
+        use std::collections::hash_map::DefaultHasher;
+
+        let inner1: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+        let inner2: Arc<dyn PhysicalExpr> = Arc::new(TestExpr {});
+
+        let opt1 = OptionalFilterPhysicalExpr::new(inner1);
+        let opt2 = OptionalFilterPhysicalExpr::new(inner2);
+
+        // Same inner type → equal
+        assert_eq!(opt1, opt2);
+
+        // Same hash
+        let mut h1 = DefaultHasher::new();
+        let mut h2 = DefaultHasher::new();
+        opt1.hash(&mut h1);
+        opt2.hash(&mut h2);
+        assert_eq!(h1.finish(), h2.finish());
+    }
 }
diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs
index ef2a8c18470c4..905227e0085ac 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -1127,6 +1127,14 @@ impl
TryFrom<&protobuf::ParquetOptions> for ParquetOptions { max_predicate_cache_size: value.max_predicate_cache_size_opt.map(|opt| match opt { protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => Some(v as usize), }).unwrap_or(None), + // Adaptive selectivity tracker knobs are not yet plumbed + // through the proto schema; restore their config-side defaults + // so a proto round-trip preserves the default behavior. + filter_pushdown_min_bytes_per_sec: ParquetOptions::default() + .filter_pushdown_min_bytes_per_sec, + filter_collecting_byte_ratio_threshold: ParquetOptions::default() + .filter_collecting_byte_ratio_threshold, + filter_confidence_z: ParquetOptions::default().filter_confidence_z, use_content_defined_chunking: value.content_defined_chunking.map(|cdc| { let defaults = CdcOptions::default(); CdcOptions { diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index a050f5fba2061..3e1ec47fac169 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -556,6 +556,13 @@ mod parquet { max_predicate_cache_size: proto.max_predicate_cache_size_opt.as_ref().map(|opt| match opt { parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size) => *size as usize, }), + // Adaptive selectivity tracker knobs are not yet plumbed + // through the proto schema; fall back to config-side defaults. + filter_pushdown_min_bytes_per_sec: ParquetOptions::default() + .filter_pushdown_min_bytes_per_sec, + filter_collecting_byte_ratio_threshold: ParquetOptions::default() + .filter_collecting_byte_ratio_threshold, + filter_confidence_z: ParquetOptions::default().filter_confidence_z, use_content_defined_chunking: proto.content_defined_chunking.map(|cdc| { let defaults = CdcOptions::default(); CdcOptions { diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index b04c78bd2774c..71ef46518e029 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -245,6 +245,9 @@ datafusion.execution.parquet.dictionary_enabled true datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL +datafusion.execution.parquet.filter_collecting_byte_ratio_threshold 0.2 +datafusion.execution.parquet.filter_confidence_z 2 +datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec 104857600 datafusion.execution.parquet.force_filter_selections false datafusion.execution.parquet.max_predicate_cache_size NULL datafusion.execution.parquet.max_row_group_size 1048576 @@ -392,6 +395,9 @@ datafusion.execution.parquet.dictionary_enabled true (writing) Sets if dictionar datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets best effort maximum dictionary page size, in bytes datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. 
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index b04c78bd2774c..71ef46518e029 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -245,6 +245,9 @@ datafusion.execution.parquet.dictionary_enabled true
 datafusion.execution.parquet.dictionary_page_size_limit 1048576
 datafusion.execution.parquet.enable_page_index true
 datafusion.execution.parquet.encoding NULL
+datafusion.execution.parquet.filter_collecting_byte_ratio_threshold 0.2
+datafusion.execution.parquet.filter_confidence_z 2
+datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec 104857600
 datafusion.execution.parquet.force_filter_selections false
 datafusion.execution.parquet.max_predicate_cache_size NULL
 datafusion.execution.parquet.max_row_group_size 1048576
@@ -392,6 +395,9 @@ datafusion.execution.parquet.dictionary_enabled true (writing) Sets if dictionar
 datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets best effort maximum dictionary page size, in bytes
 datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
 datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
+datafusion.execution.parquet.filter_collecting_byte_ratio_threshold 0.2 (reading) Initial-placement heuristic for adaptive filters: when a filter is first observed, place it at row-level if its column bytes are this fraction or less of the total projection's column bytes. Above this ratio, the filter starts as post-scan and is only promoted later if its measured throughput crosses `filter_pushdown_min_bytes_per_sec`.
+datafusion.execution.parquet.filter_confidence_z 2 (reading) Z-score for the one-sided confidence interval the adaptive filter scheduler uses when promoting, demoting, or dropping filters. The default of `2.0` (≈ 97.5% one-sided confidence) keeps strategy moves conservative; lower the value for faster adaptation, raise it for more stable placements.
+datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec 104857600 (reading) Minimum throughput, in bytes per second, that an adaptive row-level filter must sustain to remain at row-level. Filters that drop below this threshold (with statistical confidence; see `filter_confidence_z`) are demoted to post-scan, or dropped entirely if they were optional (e.g. a hash-join build-side dynamic filter). Set to `0` to force every filter to row-level (skipping the threshold check); set to `f64::INFINITY` to keep every filter post-scan.
 datafusion.execution.parquet.force_filter_selections false (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows.
 datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching.
 datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
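The `filter_confidence_z` description above implies a one-sided z-interval gate over observed per-filter throughput. The actual estimator lives in the reader code, which this diff does not show; the following is a hedged sketch of how such a gate could drive the demotion decision, with the helper name and sampling scheme assumed:

```rust
/// Hypothetical helper (not the reader's real implementation): demote a
/// row-level filter only when we are confident its *true* mean throughput
/// is below the threshold, i.e. when even the upper end of the one-sided
/// confidence interval falls below it.
fn should_demote(samples_bytes_per_sec: &[f64], min_bytes_per_sec: f64, z: f64) -> bool {
    let n = samples_bytes_per_sec.len() as f64;
    if n < 2.0 {
        return false; // not enough evidence either way
    }
    let mean = samples_bytes_per_sec.iter().sum::<f64>() / n;
    // Sample variance (Bessel-corrected).
    let var = samples_bytes_per_sec
        .iter()
        .map(|s| (s - mean).powi(2))
        .sum::<f64>()
        / (n - 1.0);
    // Upper bound of the one-sided interval around the sample mean.
    let upper = mean + z * (var / n).sqrt();
    upper < min_bytes_per_sec
}

fn main() {
    // 100 MB/s threshold (the default), z = 2.0: a filter consistently far
    // below the threshold gets demoted ...
    let slow = [40e6, 45e6, 42e6, 44e6];
    assert!(should_demote(&slow, 104_857_600.0, 2.0));
    // ... while noisy samples straddling the threshold do not.
    let noisy = [60e6, 150e6, 90e6, 140e6];
    assert!(!should_demote(&noisy, 104_857_600.0, 2.0));
}
```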
diff --git a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt
index 8469c32a17033..b0ef922cde097 100644
--- a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt
+++ b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt
@@ -24,6 +24,13 @@ set datafusion.explain.physical_plan_only = true;
 statement ok
 set datafusion.execution.parquet.pushdown_filters = true;
 
+# Force every conjunct onto the row-filter path so the
+# `pushdown_rows_*` and `predicate_cache_*` metrics this file asserts on are
+# populated. The default of 100 MB/s keeps small test files post-scan,
+# where the row-filter metrics are zero.
+statement ok
+set datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec = 0;
+
 # this one is also required to make DF skip second file due to "sufficient" amount of rows
 statement ok
 set datafusion.execution.collect_statistics = true;
@@ -1030,5 +1037,8 @@ RESET datafusion.explain.physical_plan_only;
 statement ok
 RESET datafusion.execution.parquet.pushdown_filters;
 
+statement ok
+RESET datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec;
+
 statement ok
 drop table t;
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 46039f3c99c27..7ce716bd75258 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -92,6 +92,9 @@ The following configuration settings are available:
 | datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. |
 | datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files |
 | datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. |
+| datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec | 104857600 | (reading) Minimum throughput, in bytes per second, that an adaptive row-level filter must sustain to remain at row-level. Filters that drop below this threshold (with statistical confidence; see `filter_confidence_z`) are demoted to post-scan, or dropped entirely if they were optional (e.g. a hash-join build-side dynamic filter). Set to `0` to force every filter to row-level (skipping the threshold check); set to `f64::INFINITY` to keep every filter post-scan. |
+| datafusion.execution.parquet.filter_collecting_byte_ratio_threshold | 0.2 | (reading) Initial-placement heuristic for adaptive filters: when a filter is first observed, place it at row-level if its column bytes are this fraction or less of the total projection's column bytes. Above this ratio, the filter starts as post-scan and is only promoted later if its measured throughput crosses `filter_pushdown_min_bytes_per_sec`. |
+| datafusion.execution.parquet.filter_confidence_z | 2 | (reading) Z-score for the one-sided confidence interval the adaptive filter scheduler uses when promoting, demoting, or dropping filters. The default of `2.0` (≈ 97.5% one-sided confidence) keeps strategy moves conservative; lower the value for faster adaptation, raise it for more stable placements. |
 | datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes |
 | datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in rows |
 | datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" |
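`filter_collecting_byte_ratio_threshold` documents a pure byte-ratio rule for initial placement. A small sketch of that arithmetic under the documented defaults (the helper and `Placement` enum are illustrative, not the reader's real types):

```rust
#[derive(Debug, PartialEq)]
enum Placement {
    RowLevel, // evaluated during the scan, can skip decoding other columns
    PostScan, // evaluated after the batch is materialized
}

/// Hypothetical helper mirroring the documented heuristic: a filter starts
/// at row-level only when the bytes of the columns it touches are a small
/// enough fraction of the whole projection's bytes.
fn initial_placement(
    filter_column_bytes: u64,
    projection_bytes: u64,
    threshold: f64,
) -> Placement {
    if projection_bytes == 0 {
        return Placement::RowLevel; // degenerate projection: nothing to save
    }
    let ratio = filter_column_bytes as f64 / projection_bytes as f64;
    if ratio <= threshold {
        Placement::RowLevel
    } else {
        Placement::PostScan
    }
}

fn main() {
    // With the default threshold of 0.2: a filter over a narrow column
    // (10 MB of a 100 MB projection) starts at row-level ...
    assert_eq!(initial_placement(10 << 20, 100 << 20, 0.2), Placement::RowLevel);
    // ... while one touching half of the projection starts post-scan and
    // must earn promotion via `filter_pushdown_min_bytes_per_sec`.
    assert_eq!(initial_placement(50 << 20, 100 << 20, 0.2), Placement::PostScan);
}
```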