diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index d64fba6c22113..b3c1e10584a0e 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install cargo-audit - uses: taiki-e/install-action@85b24a67ef0c632dfefad70b9d5ce8fddb040754 # v2.75.10 + uses: taiki-e/install-action@055f5df8c3f65ea01cd41e9dc855becd88953486 # v2.75.18 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 920e1e79c8540..70d38b28112de 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -45,11 +45,11 @@ jobs: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 with: category: "/language:actions" diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 89bd77670c12d..7713d5dd31422 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -41,7 +41,7 @@ jobs: path: asf-site - name: Setup uv - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 - name: Install dependencies run: uv sync --package datafusion-docs diff --git a/.github/workflows/docs_pr.yaml b/.github/workflows/docs_pr.yaml index 5abf9a119d2f5..dab81fd6452da 100644 --- a/.github/workflows/docs_pr.yaml +++ b/.github/workflows/docs_pr.yaml @@ -45,7 +45,7 @@ jobs: submodules: true fetch-depth: 1 - name: Setup uv - uses: 
astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 - name: Install doc dependencies run: uv sync --package datafusion-docs - name: Install dependency graph tooling diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b176535616546..5ef886c66f0ef 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -430,7 +430,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@85b24a67ef0c632dfefad70b9d5ce8fddb040754 # v2.75.10 + uses: taiki-e/install-action@055f5df8c3f65ea01cd41e9dc855becd88953486 # v2.75.18 with: tool: wasm-pack - name: Run tests with headless mode @@ -770,7 +770,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@85b24a67ef0c632dfefad70b9d5ce8fddb040754 # v2.75.10 + uses: taiki-e/install-action@055f5df8c3f65ea01cd41e9dc855becd88953486 # v2.75.18 with: tool: cargo-msrv diff --git a/Cargo.lock b/Cargo.lock index 37b18c3048f30..eaac23828b72e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -221,7 +221,7 @@ dependencies = [ "arrow-select", "arrow-string", "half", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] @@ -272,7 +272,7 @@ dependencies = [ "flate2", "indexmap 2.14.0", "liblzma", - "rand 0.9.2", + "rand 0.9.4", "serde", "serde_json", "snap", @@ -596,9 +596,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.15" +version = "1.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc" +checksum = "50f156acdd2cf55f5aa53ee416c4ac851cf1222694506c0b1f78c85695e9ca9d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -638,9 +638,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.16.2" +version 
= "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" dependencies = [ "aws-lc-sys", "zeroize", @@ -648,9 +648,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.39.0" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" dependencies = [ "cc", "cmake", @@ -660,9 +660,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.7.2" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fc0651c57e384202e47153c1260b84a9936e19803d747615edf199dc3b98d17" +checksum = "5dcd93c82209ac7413532388067dce79be5a8780c1786e5fae3df22e4dee2864" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -685,9 +685,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.97.0" +version = "1.98.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aadc669e184501caaa6beafb28c6267fc1baef0810fb58f9b205485ca3f2567" +checksum = "d69c77aafa20460c68b6b3213c84f6423b6e76dbf89accd3e1789a686ffd9489" dependencies = [ "aws-credential-types", "aws-runtime", @@ -709,9 +709,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.99.0" +version = "1.100.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1342a7db8f358d3de0aed2007a0b54e875458e39848d54cc1d46700b2bfcb0a8" +checksum = "1c7e7b09346d5ca22a2a08267555843a6a0127fb20d8964cb6ecfb8fdb190225" dependencies = [ "aws-credential-types", "aws-runtime", @@ -733,9 +733,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.101.0" +version = "1.103.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ab41ad64e4051ecabeea802d6a17845a91e83287e1dd249e6963ea1ba78c428a" +checksum = "c2249b81a2e73a8027c41c378463a81ec39b8510f184f2caab87de912af0f49b" dependencies = [ "aws-credential-types", "aws-runtime", @@ -758,9 +758,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.4.2" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4" +checksum = "68dc0b907359b120170613b5c09ccc61304eac3998ff6274b97d93ee6490115a" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -773,7 +773,7 @@ dependencies = [ "http 0.2.12", "http 1.4.0", "percent-encoding", - "sha2", + "sha2 0.11.0", "time", "tracing", ] @@ -864,9 +864,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.10.3" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "028999056d2d2fd58a697232f9eec4a643cf73a71cf327690a7edad1d2af2110" +checksum = "0504b1ab12debb5959e5165ee5fe97dd387e7aa7ea6a477bfd7635dfe769a4f5" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -889,11 +889,12 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.11.6" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6" +checksum = "b71a13df6ada0aafbf21a73bdfcdf9324cfa9df77d96b8446045be3cde61b42e" dependencies = [ "aws-smithy-async", + "aws-smithy-runtime-api-macros", "aws-smithy-types", "bytes", "http 0.2.12", @@ -904,6 +905,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-smithy-runtime-api-macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d7396fd9500589e62e460e987ecb671bad374934e55ec3b5f498cc7a8a8a7b7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "aws-smithy-types" version = "1.4.7" @@ -938,9 +950,9 @@ 
dependencies = [ [[package]] name = "aws-types" -version = "1.3.14" +version = "1.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47c8323699dd9b3c8d5b3c13051ae9cdef58fd179957c882f8374dd8725962d9" +checksum = "2f4bbcaa9304ea40902d3d5f42a0428d1bd895a2b0f6999436fb279ffddc58ac" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -952,9 +964,9 @@ dependencies = [ [[package]] name = "axum" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" dependencies = [ "axum-core", "bytes", @@ -1030,9 +1042,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" [[package]] name = "blake2" @@ -1101,7 +1113,7 @@ dependencies = [ "log", "num", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.4", "rustls", "rustls-native-certs", "rustls-pki-types", @@ -1225,9 +1237,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.57" +version = "1.2.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" dependencies = [ "find-msvc-tools", "jobserver", @@ -1255,7 +1267,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", "cpufeatures 0.3.0", - "rand_core 0.10.0", + "rand_core 0.10.1", ] [[package]] @@ -1311,9 +1323,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.6.0" +version = "4.6.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" dependencies = [ "clap_builder", "clap_derive", @@ -1333,9 +1345,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.6.0" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ "heck", "proc-macro2", @@ -1360,13 +1372,19 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.57" +version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" dependencies = [ "cc", ] +[[package]] +name = "cmov" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746" + [[package]] name = "colorchoice" version = "1.0.5" @@ -1674,6 +1692,15 @@ version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7ab264ea985f1bd27887d7b21ea2bb046728e05d11909ca138d700c494730db" +[[package]] +name = "ctutils" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e" +dependencies = [ + "cmov", +] + [[package]] name = "cty" version = "0.2.2" @@ -1730,7 +1757,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-schema", @@ -1783,7 +1810,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "rand_distr", "recursive", 
"regex", @@ -1802,7 +1829,7 @@ dependencies = [ [[package]] name = "datafusion-benchmarks" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -1818,7 +1845,7 @@ dependencies = [ "mimalloc", "object_store", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "serde", "serde_json", @@ -1829,7 +1856,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -1852,7 +1879,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -1874,7 +1901,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -1898,6 +1925,7 @@ dependencies = [ "regex", "rstest", "rustyline", + "serde_json", "testcontainers-modules", "tokio", "url", @@ -1905,7 +1933,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-ipc", @@ -1923,7 +1951,7 @@ dependencies = [ "log", "object_store", "parquet", - "rand 0.9.2", + "rand 0.9.4", "recursive", "sqlparser", "tokio", @@ -1933,7 +1961,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "53.0.0" +version = "53.1.0" dependencies = [ "futures", "log", @@ -1942,7 +1970,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-compression", @@ -1969,7 +1997,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "tokio", "tokio-util", @@ -1979,7 +2007,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-ipc", @@ -2002,7 +2030,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "53.0.0" +version = "53.1.0" 
dependencies = [ "arrow", "arrow-avro", @@ -2019,7 +2047,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2040,7 +2068,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2062,7 +2090,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2095,11 +2123,11 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.0.0" +version = "53.1.0" [[package]] name = "datafusion-examples" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-flight", @@ -2123,7 +2151,7 @@ dependencies = [ "nom", "object_store", "prost", - "rand 0.9.2", + "rand 0.9.4", "serde", "serde_json", "strum", @@ -2140,7 +2168,7 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-buffer", @@ -2156,14 +2184,14 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-schema", @@ -2187,7 +2215,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2198,7 +2226,7 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "53.0.0" +version = "53.1.0" dependencies = [ "abi_stable", "arrow", @@ -2233,7 +2261,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-buffer", @@ -2254,19 +2282,19 @@ dependencies = [ "hex", "itertools 0.14.0", "log", - "md-5", + "md-5 0.10.6", "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", - 
"sha2", + "sha2 0.10.9", "tokio", "uuid", ] [[package]] name = "datafusion-functions-aggregate" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "criterion", @@ -2282,24 +2310,24 @@ dependencies = [ "half", "log", "num-traits", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "criterion", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "datafusion-functions-nested" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-ord", @@ -2319,12 +2347,12 @@ dependencies = [ "itoa", "log", "memchr", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "datafusion-functions-table" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2337,7 +2365,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "criterion", @@ -2353,7 +2381,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "53.0.0" +version = "53.1.0" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2361,7 +2389,7 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.0.0" +version = "53.1.0" dependencies = [ "datafusion-doc", "quote", @@ -2370,7 +2398,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2397,7 +2425,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "criterion", @@ -2414,7 +2442,7 @@ dependencies = [ "itertools 0.14.0", "parking_lot", "petgraph", - "rand 0.9.2", + "rand 0.9.4", "recursive", "rstest", "tokio", @@ -2422,7 +2450,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" 
-version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2435,7 +2463,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "chrono", @@ -2446,12 +2474,12 @@ dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "parking_lot", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "datafusion-physical-optimizer" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2472,7 +2500,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-data", @@ -2501,7 +2529,7 @@ dependencies = [ "num-traits", "parking_lot", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.4", "rstest", "rstest_reuse", "tokio", @@ -2509,7 +2537,7 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2539,7 +2567,7 @@ dependencies = [ "pbjson 0.9.0", "pretty_assertions", "prost", - "rand 0.9.2", + "rand 0.9.4", "serde", "serde_json", "tokio", @@ -2547,7 +2575,7 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2559,7 +2587,7 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2577,7 +2605,7 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "53.0.0" +version = "53.1.0" dependencies = [ "async-trait", "datafusion-common", @@ -2589,7 +2617,7 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "bigdecimal", @@ -2607,16 +2635,16 @@ dependencies = [ "log", "num-traits", "percent-encoding", - "rand 0.9.2", + "rand 0.9.4", "serde_json", "sha1 0.11.0", - "sha2", + "sha2 0.10.9", "url", ] 
[[package]] name = "datafusion-sql" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "bigdecimal", @@ -2641,7 +2669,7 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "53.0.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2673,7 +2701,7 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "53.0.0" +version = "53.1.0" dependencies = [ "async-recursion", "async-trait", @@ -2694,7 +2722,7 @@ dependencies = [ [[package]] name = "datafusion-wasmtest" -version = "53.0.0" +version = "53.1.0" dependencies = [ "bytes", "chrono", @@ -2751,6 +2779,7 @@ dependencies = [ "block-buffer 0.12.0", "const-oid", "crypto-common 0.2.1", + "ctutils", ] [[package]] @@ -2881,9 +2910,9 @@ dependencies = [ [[package]] name = "env_filter" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef" dependencies = [ "log", "regex", @@ -2948,18 +2977,18 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.3.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" [[package]] name = "ferroid" -version = "0.8.9" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb330bbd4cb7a5b9f559427f06f98a4f853a137c8298f3bd3f8ca57663e21986" +checksum = "ee93edf3c501f0035bbeffeccfed0b79e14c311f12195ec0e661e114a0f60da4" dependencies = [ "portable-atomic", - "rand 0.9.2", + "rand 0.10.1", "web-time", ] @@ -3244,7 +3273,7 @@ dependencies = [ "cfg-if", "libc", "r-efi 6.0.0", - "rand_core 0.10.0", + "rand_core 0.10.1", "wasip2", 
"wasip3", ] @@ -3296,7 +3325,7 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "rand_distr", "zerocopy", ] @@ -3358,11 +3387,11 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hmac" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" dependencies = [ - "digest 0.10.7", + "digest 0.11.2", ] [[package]] @@ -3449,18 +3478,18 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hybrid-array" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a79f2aff40c18ab8615ddc5caa9eb5b96314aef18fe5823090f204ad988e813" +checksum = "3944cf8cf766b40e2a1a333ee5e9b563f854d5fa49d6a8ca2764e97c6eddb214" dependencies = [ "typenum", ] [[package]] name = "hyper" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" dependencies = [ "atomic-waker", "bytes", @@ -3473,7 +3502,6 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "pin-utils", "smallvec", "tokio", "want", @@ -3496,16 +3524,15 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.7" +version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ "http 1.4.0", "hyper", "hyper-util", "rustls", "rustls-native-certs", - "rustls-pki-types", "tokio", "tokio-rustls", "tower-service", @@ -3588,12 +3615,13 @@ 
dependencies = [ [[package]] name = "icu_collections" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "utf8_iter", "yoke", "zerofrom", "zerovec", @@ -3601,9 +3629,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", @@ -3614,9 +3642,9 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" dependencies = [ "icu_collections", "icu_normalizer_data", @@ -3628,15 +3656,15 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" [[package]] name = "icu_properties" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" dependencies = [ "icu_collections", "icu_locale_core", @@ -3648,15 +3676,15 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.1.2" +version = "2.2.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" [[package]] name = "icu_provider" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", @@ -3777,9 +3805,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.10" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" dependencies = [ "memchr", "serde", @@ -3851,9 +3879,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.93" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797146bb2677299a1eb6b7b50a890f4c361b29ef967addf5b2fa45dae1bb6d7d" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" dependencies = [ "cfg-if", "futures-util", @@ -3932,9 +3960,9 @@ dependencies = [ [[package]] name = "libbz2-rs-sys" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" +checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f" [[package]] name = "libc" @@ -3963,9 +3991,9 @@ dependencies = [ [[package]] name = "liblzma-sys" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" +checksum = 
"1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" dependencies = [ "cc", "libc", @@ -3980,25 +4008,24 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libmimalloc-sys" -version = "0.1.44" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" +checksum = "bc89deee4af0429081d2a518c0431ae068222a5a262a3bc6ff4d8535ec2e02fe" dependencies = [ "cc", "cty", - "libc", ] [[package]] name = "libredox" -version = "0.1.14" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" dependencies = [ "bitflags", "libc", "plain", - "redox_syscall 0.7.3", + "redox_syscall 0.7.4", ] [[package]] @@ -4027,9 +4054,9 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "lock_api" @@ -4077,6 +4104,16 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "md-5" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69b6441f590336821bb897fb28fc622898ccceb1d6cea3fde5ea86b090c4de98" +dependencies = [ + "cfg-if", + "digest 0.11.2", +] + [[package]] name = "memchr" version = "2.8.0" @@ -4085,9 +4122,9 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "mimalloc" -version = "0.1.48" +version = "0.1.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e1ee66a4b64c74f4ef288bcbb9192ad9c3feaad75193129ac8509af543894fd8" +checksum = "aca3c01a711f395b4257b81674c0e90e8dd1f1e62c4b7db45f684cc7a4fcb18a" dependencies = [ "libmimalloc-sys", ] @@ -4218,9 +4255,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] name = "num-integer" @@ -4319,7 +4356,7 @@ dependencies = [ "humantime", "hyper", "itertools 0.14.0", - "md-5", + "md-5 0.10.6", "parking_lot", "percent-encoding", "quick-xml", @@ -4639,9 +4676,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.32" +version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" [[package]] name = "plain" @@ -4685,18 +4722,18 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" dependencies = [ "portable-atomic", ] [[package]] name = "postgres-derive" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56df96f5394370d1b20e49de146f9e6c25aa9ae750f449c9d665eafecb3ccae6" +checksum = "ca1dad89d9ffdbf78502fde418eeede499b87772d88be780478f7f76dc8d471f" dependencies = [ "heck", "proc-macro2", @@ -4706,19 +4743,19 @@ dependencies = [ [[package]] name = "postgres-protocol" -version = 
"0.6.10" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ee9dd5fe15055d2b6806f4736aa0c9637217074e224bbec46d4041b91bb9491" +checksum = "56201207dac53e2f38e848e31b4b91616a6bb6e0c7205b77718994a7f49e70fc" dependencies = [ "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", "hmac", - "md-5", + "md-5 0.11.0", "memchr", - "rand 0.9.2", - "sha2", + "rand 0.10.1", + "sha2 0.11.0", "stringprep", ] @@ -4737,9 +4774,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" dependencies = [ "zerovec", ] @@ -4906,7 +4943,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -4965,9 +5002,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", "rand_chacha 0.3.1", @@ -4976,9 +5013,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -4992,7 +5029,7 @@ checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", "getrandom 0.4.2", - "rand_core 0.10.0", + "rand_core 0.10.1", ] [[package]] @@ -5035,9 +5072,9 @@ dependencies = [ [[package]] name = "rand_core" -version = 
"0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" [[package]] name = "rand_distr" @@ -5046,14 +5083,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "rayon" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" dependencies = [ "either", "rayon-core", @@ -5100,9 +5137,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" dependencies = [ "bitflags", ] @@ -5290,15 +5327,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", - "rand 0.8.5", + "rand 0.8.6", "syn 2.0.117", ] [[package]] name = "rustc-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] name = "rustc_version" @@ -5324,9 +5361,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.37" +version = "0.23.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +checksum = "69f9466fb2c14ea04357e91413efb882e2a6d4a406e625449bc0a5d360d53a21" dependencies = [ "aws-lc-rs", "log", @@ -5683,6 +5720,17 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.2", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -5710,9 +5758,9 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "simdutf8" @@ -5792,9 +5840,9 @@ dependencies = [ "humantime", "itertools 0.13.0", "libtest-mimic", - "md-5", + "md-5 0.10.6", "owo-colors", - "rand 0.8.5", + "rand 0.8.6", "regex", "similar", "subst", @@ -6022,14 +6070,14 @@ dependencies = [ "chrono-tz", "datafusion-common", "env_logger", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "testcontainers" -version = "0.27.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bd36b06a2a6c0c3c81a83be1ab05fe86460d054d4d51bf513bc56b3e15bdc22" +checksum = "bfd5785b5483672915ed5fe3cddf9f546802779fc1eceff0a6fb7321fac81c1e" dependencies = [ "astral-tokio-tar", "async-trait", @@ -6147,9 +6195,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", "zerovec", @@ -6182,9 +6230,9 @@ checksum = 
"1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.52.0" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91135f59b1cbf38c91e73cf3386fca9bb77915c45ce2771460c9d92f0f3d776" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ "bytes", "libc", @@ -6271,18 +6319,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "1.0.1+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.5+spec-1.1.0" +version = "0.25.11+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" dependencies = [ "indexmap 2.14.0", "toml_datetime", @@ -6292,9 +6340,9 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.10+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ "winnow", ] @@ -6480,15 +6528,15 @@ checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" [[package]] name = "typenum" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" [[package]] name = "typewit" -version = "1.14.2" +version = 
"1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71" +checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" [[package]] name = "typify" @@ -6608,9 +6656,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "3.2.1" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ab5172ab0c2b6d01a9bb4f9332f7c1211193ea002742188040d09ea4eafe867" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" dependencies = [ "base64 0.22.1", "log", @@ -6623,9 +6671,9 @@ dependencies = [ [[package]] name = "ureq-proto" -version = "0.5.3" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" dependencies = [ "base64 0.22.1", "http 1.4.0", @@ -6672,9 +6720,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.23.0" +version = "1.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "getrandom 0.4.2", "js-sys", @@ -6735,11 +6783,11 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.2+wasi-0.2.9" +version = "1.0.3+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", ] [[package]] @@ -6748,7 +6796,7 @@ version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.51.0", ] [[package]] @@ -6762,9 +6810,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.116" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dc0882f7b5bb01ae8c5215a1230832694481c1a4be062fd410e12ea3da5b631" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" dependencies = [ "cfg-if", "once_cell", @@ -6775,9 +6823,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.66" +version = "0.4.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19280959e2844181895ef62f065c63e0ca07ece4771b53d89bfdb967d97cbf05" +checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8" dependencies = [ "js-sys", "wasm-bindgen", @@ -6785,9 +6833,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.116" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75973d3066e01d035dbedaad2864c398df42f8dd7b1ea057c35b8407c015b537" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6795,9 +6843,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.116" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91af5e4be765819e0bcfee7322c14374dc821e35e72fa663a830bbc7dc199eac" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" dependencies = [ "bumpalo", "proc-macro2", @@ -6808,18 +6856,18 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.116" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c9bf0406a78f02f336bf1e451799cca198e8acde4ffa278f0fb20487b150a633" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" dependencies = [ "unicode-ident", ] [[package]] name = "wasm-bindgen-test" -version = "0.3.66" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea88331fc76766356287e79bb0bc032157feea8eff8f2c3f1d9ea4b94255ae1c" +checksum = "6bb55e2540ad1c56eec35fd63e2aea15f83b11ce487fd2de9ad11578dfc047ea" dependencies = [ "async-trait", "cast", @@ -6839,9 +6887,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.66" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92437fa87f58743befb3003c4f4e3e9010dd50c6935561be7645981c0de05dfd" +checksum = "caf0ca1bd612b988616bac1ab34c4e4290ef18f7148a1d8b7f31c150080e9295" dependencies = [ "proc-macro2", "quote", @@ -6850,9 +6898,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-shared" -version = "0.2.116" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10091e48e3231b0f567b098ddb9a107310eb2629ae0eaa7c98dd746d5e80ee78" +checksum = "23cda5ecc67248c48d3e705d3e03e00af905769b78b9d2a1678b663b8b9d4472" [[package]] name = "wasm-encoder" @@ -6903,9 +6951,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.93" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "749466a37ee189057f54748b200186b59a03417a117267baf3fd89cecc9fb837" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" dependencies = [ "js-sys", "wasm-bindgen", @@ -7242,9 +7290,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" +checksum = 
"09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5" dependencies = [ "memchr", ] @@ -7258,6 +7306,12 @@ dependencies = [ "wit-bindgen-rust-macro", ] +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + [[package]] name = "wit-bindgen-core" version = "0.51.0" @@ -7339,9 +7393,9 @@ dependencies = [ [[package]] name = "writeable" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "xattr" @@ -7367,9 +7421,9 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -7378,9 +7432,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", @@ -7390,18 +7444,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = 
"zerocopy-derive" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", @@ -7410,18 +7464,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", @@ -7437,9 +7491,9 @@ checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" dependencies = [ "displaydoc", "yoke", @@ -7448,9 +7502,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ "yoke", "zerofrom", @@ -7459,9 +7513,9 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.2" +version = "0.11.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 8d90a11858a45..59707ba8e3f27 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,7 +79,7 @@ repository = "https://github.com/apache/datafusion" # Define Minimum Supported Rust Version (MSRV) rust-version = "1.88.0" # Define DataFusion version -version = "53.0.0" +version = "53.1.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -117,43 +117,43 @@ chrono = { version = "0.4.44", default-features = false } criterion = "0.8" ctor = "0.10.0" dashmap = "6.0.1" -datafusion = { path = "datafusion/core", version = "53.0.0", default-features = false } -datafusion-catalog = { path = "datafusion/catalog", version = "53.0.0" } -datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "53.0.0" } -datafusion-common = { path = "datafusion/common", version = "53.0.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "53.0.0" } -datafusion-datasource = { path = "datafusion/datasource", version = "53.0.0", default-features = false } -datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "53.0.0", default-features = false } -datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "53.0.0", default-features = false } -datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "53.0.0", default-features = false } -datafusion-datasource-json = { path = "datafusion/datasource-json", version = "53.0.0", default-features = false } -datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "53.0.0", default-features = false } -datafusion-doc = { path = 
"datafusion/doc", version = "53.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "53.0.0", default-features = false } -datafusion-expr = { path = "datafusion/expr", version = "53.0.0", default-features = false } -datafusion-expr-common = { path = "datafusion/expr-common", version = "53.0.0" } -datafusion-ffi = { path = "datafusion/ffi", version = "53.0.0" } -datafusion-functions = { path = "datafusion/functions", version = "53.0.0" } -datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "53.0.0" } -datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "53.0.0" } -datafusion-functions-nested = { path = "datafusion/functions-nested", version = "53.0.0", default-features = false } -datafusion-functions-table = { path = "datafusion/functions-table", version = "53.0.0" } -datafusion-functions-window = { path = "datafusion/functions-window", version = "53.0.0" } -datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "53.0.0" } -datafusion-macros = { path = "datafusion/macros", version = "53.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "53.0.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "53.0.0", default-features = false } -datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "53.0.0", default-features = false } -datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "53.0.0", default-features = false } -datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "53.0.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "53.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "53.0.0" } -datafusion-proto-common = { path = "datafusion/proto-common", version = "53.0.0" } -datafusion-pruning = { path = 
"datafusion/pruning", version = "53.0.0" } -datafusion-session = { path = "datafusion/session", version = "53.0.0" } -datafusion-spark = { path = "datafusion/spark", version = "53.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "53.0.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "53.0.0" } +datafusion = { path = "datafusion/core", version = "53.1.0", default-features = false } +datafusion-catalog = { path = "datafusion/catalog", version = "53.1.0" } +datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "53.1.0" } +datafusion-common = { path = "datafusion/common", version = "53.1.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "53.1.0" } +datafusion-datasource = { path = "datafusion/datasource", version = "53.1.0", default-features = false } +datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "53.1.0", default-features = false } +datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "53.1.0", default-features = false } +datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "53.1.0", default-features = false } +datafusion-datasource-json = { path = "datafusion/datasource-json", version = "53.1.0", default-features = false } +datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "53.1.0", default-features = false } +datafusion-doc = { path = "datafusion/doc", version = "53.1.0" } +datafusion-execution = { path = "datafusion/execution", version = "53.1.0", default-features = false } +datafusion-expr = { path = "datafusion/expr", version = "53.1.0", default-features = false } +datafusion-expr-common = { path = "datafusion/expr-common", version = "53.1.0" } +datafusion-ffi = { path = "datafusion/ffi", version = "53.1.0" } +datafusion-functions = { path = "datafusion/functions", version = "53.1.0" } +datafusion-functions-aggregate = { path = 
"datafusion/functions-aggregate", version = "53.1.0" } +datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "53.1.0" } +datafusion-functions-nested = { path = "datafusion/functions-nested", version = "53.1.0", default-features = false } +datafusion-functions-table = { path = "datafusion/functions-table", version = "53.1.0" } +datafusion-functions-window = { path = "datafusion/functions-window", version = "53.1.0" } +datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "53.1.0" } +datafusion-macros = { path = "datafusion/macros", version = "53.1.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "53.1.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "53.1.0", default-features = false } +datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "53.1.0", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "53.1.0", default-features = false } +datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "53.1.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "53.1.0" } +datafusion-proto = { path = "datafusion/proto", version = "53.1.0" } +datafusion-proto-common = { path = "datafusion/proto-common", version = "53.1.0" } +datafusion-pruning = { path = "datafusion/pruning", version = "53.1.0" } +datafusion-session = { path = "datafusion/session", version = "53.1.0" } +datafusion-spark = { path = "datafusion/spark", version = "53.1.0" } +datafusion-sql = { path = "datafusion/sql", version = "53.1.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "53.1.0" } doc-comment = "0.3" env_logger = "0.11" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 40e0e50dacd7a..414b8c6444869 100644 --- a/datafusion-cli/Cargo.toml +++ 
b/datafusion-cli/Cargo.toml @@ -37,7 +37,7 @@ backtrace = ["datafusion/backtrace"] [dependencies] arrow = { workspace = true } async-trait = { workspace = true } -aws-config = "1.8.14" +aws-config = "1.8.16" aws-credential-types = "1.2.13" chrono = { workspace = true } clap = { version = "4.5.60", features = ["cargo", "derive"] } @@ -78,3 +78,11 @@ insta = { workspace = true } insta-cmd = "0.6.0" rstest = { workspace = true } testcontainers-modules = { workspace = true, features = ["minio"] } +# Makes sure `test_display_pg_json` behaves in a consistent way regardless of +# feature unification with dependencies +serde_json = { workspace = true, features = ["preserve_order"] } + +# Required because we pull serde_json with a feature to get consistent pg display, +# but its not directly used. +[package.metadata.cargo-machete] +ignored = "serde_json" diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs index 3cecba75e21b0..be4a2ad4fe197 100644 --- a/datafusion-cli/tests/cli_integration.rs +++ b/datafusion-cli/tests/cli_integration.rs @@ -261,11 +261,11 @@ fn bind_to_settings(snapshot_name: &str) -> SettingsBindDropGuard { "Consumer(can spill: bool) consumed XB, peak XB", ); settings.add_filter( - r"Error: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? remain available for the total pool", + r"Error: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? remain available for the total memory pool: '.*?'", "Error: Failed to allocate ", ); settings.add_filter( - r"Resources exhausted: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? remain available for the total pool", + r"Resources exhausted: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? 
remain available for the total memory pool: '.*?'", "Resources exhausted: Failed to allocate", ); diff --git a/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap b/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap index 1359cefbe71c7..5f43ca88dc9d7 100644 --- a/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap +++ b/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap @@ -18,19 +18,19 @@ exit_code: 0 | logical_plan | [ | | | { | | | "Plan": { | -| | "Expressions": [ | -| | "Int64(123)" | -| | ], | | | "Node Type": "Projection", | -| | "Output": [ | +| | "Expressions": [ | | | "Int64(123)" | | | ], | | | "Plans": [ | | | { | | | "Node Type": "EmptyRelation", | -| | "Output": [], | -| | "Plans": [] | +| | "Plans": [], | +| | "Output": [] | | | } | +| | ], | +| | "Output": [ | +| | "Int64(123)" | | | ] | | | } | | | } | diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap index fe454595eb4bc..c34e1202f55da 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap @@ -16,6 +16,6 @@ exit_code: 1 [CLI_VERSION] Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. 
caused by -Resources exhausted: Failed to allocate +Resources exhausted: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: greedy(used: 10.0 MB, pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap index bb30e387166bc..ebf7a540d8d44 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap @@ -19,6 +19,6 @@ caused by Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as: Consumer(can spill: bool) consumed XB, peak XB, Consumer(can spill: bool) consumed XB, peak XB. -Error: Failed to allocate +Error: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: greedy(used: 10.0 MB, pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap index 891d72e3cc639..9e279ca93ddcd 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap @@ -18,6 +18,6 @@ Resources exhausted: Additional allocation failed for ExternalSorter[0] with top Consumer(can spill: bool) consumed XB, peak XB, Consumer(can spill: bool) consumed XB, peak XB, Consumer(can spill: bool) consumed XB, peak XB. 
-Error: Failed to allocate +Error: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: greedy(used: 10.0 MB, pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap index 25267ea1617e5..9a228fcfb6e93 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap @@ -18,6 +18,6 @@ exit_code: 1 [CLI_VERSION] Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. caused by -Resources exhausted: Failed to allocate +Resources exhausted: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: fair(pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap index 6515050047107..d7f964a339313 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap @@ -21,6 +21,6 @@ caused by Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as: Consumer(can spill: bool) consumed XB, peak XB, Consumer(can spill: bool) consumed XB, peak XB. 
-Error: Failed to allocate +Error: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: fair(pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs index af3031c690fa3..d849a033bc66b 100644 --- a/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs +++ b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs @@ -119,7 +119,8 @@ async fn automatic_usage_example() -> Result<()> { ExternalSorter[1]#93(can spill: true) consumed 69.0 KB, peak 69.0 KB, ExternalSorter[13]#155(can spill: true) consumed 67.6 KB, peak 67.6 KB, ExternalSorter[8]#140(can spill: true) consumed 67.2 KB, peak 67.2 KB. - Error: Failed to allocate additional 10.0 MB for ExternalSorterMerge[0] with 0.0 B already allocated for this reservation - 7.1 MB remain available for the total pool + Error: Failed to allocate additional 10.0 MB for ExternalSorterMerge[0] with 0.0 B already allocated + for this reservation - 7.1 MB remain available for the total memory pool */ } } diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index e7d9e809daecc..e3da99163ed69 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -21,7 +21,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::fmt::{Display, Formatter}; use std::hash::Hash; -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use crate::error::{_plan_err, _schema_err, DataFusionError, Result}; use crate::{ @@ -129,6 +129,13 @@ impl DFSchema { } } + /// Returns a reference to a shared empty [`DFSchema`]. 
+ pub fn empty_ref() -> &'static DFSchemaRef { + static EMPTY: LazyLock = + LazyLock::new(|| Arc::new(DFSchema::empty())); + &EMPTY + } + /// Return a reference to the inner Arrow [`Schema`] /// /// Note this does not have the qualifier information diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs index 1e7c02e424256..39300b9564621 100644 --- a/datafusion/common/src/tree_node.rs +++ b/datafusion/common/src/tree_node.rs @@ -796,7 +796,9 @@ pub trait TreeNodeContainer<'a, T: 'a>: Sized { ) -> Result>; } -impl<'a, T: 'a, C: TreeNodeContainer<'a, T>> TreeNodeContainer<'a, T> for Box { +impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Default> TreeNodeContainer<'a, T> + for Box +{ fn apply_elements Result>( &'a self, f: F, @@ -805,14 +807,24 @@ impl<'a, T: 'a, C: TreeNodeContainer<'a, T>> TreeNodeContainer<'a, T> for Box } fn map_elements Result>>( - self, + mut self, f: F, ) -> Result> { - (*self).map_elements(f)?.map_data(|c| Ok(Self::new(c))) + // Rewrite in place so the existing heap allocation can be reused. + // `mem::take` hands the inner `C` to `f` while leaving + // `C::default()` in the slot, so an unwinding drop finds a valid + // `C` even if `f` panics or the `?` short-circuits. + let inner = std::mem::take(&mut *self); + Ok(inner.map_elements(f)?.update_data(|c| { + *self = c; + self + })) } } -impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Clone> TreeNodeContainer<'a, T> for Arc { +impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Clone + Default> TreeNodeContainer<'a, T> + for Arc +{ fn apply_elements Result>( &'a self, f: F, @@ -821,12 +833,18 @@ impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Clone> TreeNodeContainer<'a, T> fo } fn map_elements Result>>( - self, + mut self, f: F, ) -> Result> { - Arc::unwrap_or_clone(self) - .map_elements(f)? - .map_data(|c| Ok(Arc::new(c))) + // Rewrite in place using the same `mem::take` strategy as + // `Box::map_elements`. 
`Arc::make_mut` gives us exclusive + // access (cloning `C` first if we were sharing), after which + // `get_mut` is infallible. + let inner = std::mem::take(Arc::make_mut(&mut self)); + Ok(inner.map_elements(f)?.update_data(|c| { + *Arc::get_mut(&mut self).unwrap() = c; + self + })) } } @@ -1335,6 +1353,7 @@ impl TreeNode for T { pub(crate) mod tests { use std::collections::HashMap; use std::fmt::Display; + use std::sync::Arc; use crate::Result; use crate::tree_node::{ @@ -1342,7 +1361,7 @@ pub(crate) mod tests { TreeNodeVisitor, }; - #[derive(Debug, Eq, Hash, PartialEq, Clone)] + #[derive(Debug, Default, Eq, Hash, PartialEq, Clone)] pub struct TestTreeNode { pub(crate) children: Vec>, pub(crate) data: T, @@ -2431,4 +2450,46 @@ pub(crate) mod tests { item.visit(&mut visitor).unwrap(); } + + #[test] + fn box_map_elements_reuses_allocation() { + let boxed = Box::new(TestTreeNode::new_leaf(42i32)); + let before: *const TestTreeNode = &*boxed; + let out = boxed.map_elements(|n| Ok(Transformed::no(n))).unwrap(); + let after: *const TestTreeNode = &*out.data; + assert_eq!(after, before); + } + + #[test] + fn arc_map_elements_reuses_allocation_when_unique() { + let arc = Arc::new(TestTreeNode::new_leaf(42i32)); + let before = Arc::as_ptr(&arc); + let out = arc.map_elements(|n| Ok(Transformed::no(n))).unwrap(); + assert_eq!(Arc::as_ptr(&out.data), before); + } + + #[test] + fn arc_map_elements_clones_when_shared() { + // When the input `Arc` is shared, `make_mut` clones into a fresh + // allocation, so the reuse optimization does not apply. 
+ let arc = Arc::new(TestTreeNode::new_leaf(42i32)); + let _keepalive = Arc::clone(&arc); + let before = Arc::as_ptr(&arc); + let out = arc.map_elements(|n| Ok(Transformed::no(n))).unwrap(); + assert_ne!(Arc::as_ptr(&out.data), before); + } + + #[test] + fn box_map_elements_panic() { + use std::panic::{AssertUnwindSafe, catch_unwind}; + let boxed = Box::new(TestTreeNode::new_leaf(42i32)); + let result = catch_unwind(AssertUnwindSafe(|| { + boxed + .map_elements(|_: TestTreeNode| -> Result<_> { + panic!("simulated panic during rewrite") + }) + .ok() + })); + assert!(result.is_err()); + } } diff --git a/datafusion/datasource-parquet/src/access_plan.rs b/datafusion/datasource-parquet/src/access_plan.rs index ca4d097c37a44..32d9795d605de 100644 --- a/datafusion/datasource-parquet/src/access_plan.rs +++ b/datafusion/datasource-parquet/src/access_plan.rs @@ -349,6 +349,77 @@ impl ParquetAccessPlan { PreparedAccessPlan::new(row_group_indexes, row_selection) } + + /// Split this plan into an ordered list of sub-plans ("chunks"), each of + /// which represents a contiguous prefix of work packed together. + /// + /// Each returned plan has the same `len()` as `self`. Row groups outside + /// the chunk are set to [`RowGroupAccess::Skip`]; row groups inside the + /// chunk keep their original [`RowGroupAccess`]. + /// + /// Chunks are formed by walking `self.row_groups` in order and grouping + /// consecutive entries with `should_scan() == true`. A new chunk is started + /// whenever adding the next scannable row group would push the accumulated + /// row count past `max_rows` or compressed byte size past `max_bytes`. A + /// single row group that already exceeds either limit becomes its own + /// chunk (no sub-row-group split is performed). + /// + /// [`RowGroupAccess::Skip`] entries are carried silently in whichever chunk + /// is active at that point; they contribute no rows or bytes. + /// + /// If there are no scannable row groups, the result is empty. 
+ pub(crate) fn split_into_chunks( + self, + row_group_meta_data: &[RowGroupMetaData], + max_rows: u64, + max_bytes: u64, + ) -> Vec { + assert_eq!(self.row_groups.len(), row_group_meta_data.len()); + + let len = self.row_groups.len(); + let mut chunks: Vec = Vec::new(); + let mut current: Option<(ParquetAccessPlan, u64, u64)> = None; + + for (idx, access) in self.row_groups.into_iter().enumerate() { + if !access.should_scan() { + // Skip entries are attached to the currently open chunk (if + // any) so they do not force a chunk boundary. They contribute + // zero rows/bytes. + if let Some((plan, _, _)) = current.as_mut() { + plan.row_groups[idx] = access; + } + continue; + } + + let rg_meta = &row_group_meta_data[idx]; + let rg_rows = rg_meta.num_rows().max(0) as u64; + let rg_bytes = rg_meta.compressed_size().max(0) as u64; + + if let Some((plan, acc_rows, acc_bytes)) = current.as_mut() { + let exceeds = acc_rows.saturating_add(rg_rows) > max_rows + || acc_bytes.saturating_add(rg_bytes) > max_bytes; + if exceeds { + chunks.push(current.take().unwrap().0); + } else { + plan.row_groups[idx] = access; + *acc_rows += rg_rows; + *acc_bytes += rg_bytes; + continue; + } + } + + // Start a new chunk with this row group. + let mut plan = ParquetAccessPlan::new_none(len); + plan.row_groups[idx] = access; + current = Some((plan, rg_rows, rg_bytes)); + } + + if let Some((plan, _, _)) = current { + chunks.push(plan); + } + + chunks + } } /// Represents a prepared, fully resolved [`ParquetAccessPlan`] @@ -600,6 +671,180 @@ mod test { .collect() }); + /// Build metadata for row groups with the given `(num_rows, compressed_bytes)` + /// pairs. Returned metadata has one `BYTE_ARRAY` column per row group. 
+ fn row_groups_with_bytes(specs: &[(i64, i64)]) -> Vec { + let schema_descr = get_test_schema_descr(); + specs + .iter() + .map(|(num_rows, compressed)| { + let column = ColumnChunkMetaData::builder(schema_descr.column(0)) + .set_num_values(*num_rows) + .set_total_compressed_size(*compressed) + .build() + .unwrap(); + + RowGroupMetaData::builder(schema_descr.clone()) + .set_num_rows(*num_rows) + .set_column_metadata(vec![column]) + .build() + .unwrap() + }) + .collect() + } + + fn access_kinds(plan: &ParquetAccessPlan) -> Vec<&'static str> { + plan.inner() + .iter() + .map(|rg| match rg { + RowGroupAccess::Skip => "skip", + RowGroupAccess::Scan => "scan", + RowGroupAccess::Selection(_) => "sel", + }) + .collect() + } + + #[test] + fn test_split_into_chunks_empty() { + let plan = ParquetAccessPlan::new(vec![]); + let chunks = plan.split_into_chunks(&[], 1000, 1000); + assert!(chunks.is_empty()); + } + + #[test] + fn test_split_into_chunks_all_skip() { + let meta = row_groups_with_bytes(&[(100, 1_000), (100, 1_000)]); + let plan = ParquetAccessPlan::new_none(2); + let chunks = plan.split_into_chunks(&meta, 1000, 10_000); + assert!(chunks.is_empty()); + } + + #[test] + fn test_split_into_chunks_one_per_row_group() { + // Each row group is already at the per-morsel limit, so each becomes + // its own chunk. + let meta = row_groups_with_bytes(&[(100, 1_000), (100, 1_000), (100, 1_000)]); + let plan = ParquetAccessPlan::new_all(3); + let chunks = plan.split_into_chunks(&meta, 100, 1_000); + assert_eq!(chunks.len(), 3); + assert_eq!(access_kinds(&chunks[0]), vec!["scan", "skip", "skip"]); + assert_eq!(access_kinds(&chunks[1]), vec!["skip", "scan", "skip"]); + assert_eq!(access_kinds(&chunks[2]), vec!["skip", "skip", "scan"]); + } + + #[test] + fn test_split_into_chunks_packs_small() { + // Three small row groups fit within one chunk by rows AND bytes. 
+ let meta = row_groups_with_bytes(&[(30, 100), (30, 100), (30, 100)]); + let plan = ParquetAccessPlan::new_all(3); + let chunks = plan.split_into_chunks(&meta, 100, 1_000); + assert_eq!(chunks.len(), 1); + assert_eq!(access_kinds(&chunks[0]), vec!["scan", "scan", "scan"]); + } + + #[test] + fn test_split_into_chunks_oversized_single() { + // First row group alone exceeds max_rows; still becomes its own chunk + // (no sub-row-group split). + let meta = row_groups_with_bytes(&[(1_000, 100), (10, 100), (10, 100)]); + let plan = ParquetAccessPlan::new_all(3); + let chunks = plan.split_into_chunks(&meta, 100, 10_000); + assert_eq!(chunks.len(), 2); + assert_eq!(access_kinds(&chunks[0]), vec!["scan", "skip", "skip"]); + assert_eq!(access_kinds(&chunks[1]), vec!["skip", "scan", "scan"]); + } + + #[test] + fn test_split_into_chunks_respects_bytes() { + // All row groups are small in rows but the second one is big enough + // that it must start a new chunk on byte budget alone. + let meta = row_groups_with_bytes(&[(10, 500), (10, 600), (10, 100), (10, 100)]); + let plan = ParquetAccessPlan::new_all(4); + let chunks = plan.split_into_chunks(&meta, 1_000_000, 1_000); + assert_eq!(chunks.len(), 2); + assert_eq!( + access_kinds(&chunks[0]), + vec!["scan", "skip", "skip", "skip"] + ); + assert_eq!( + access_kinds(&chunks[1]), + vec!["skip", "scan", "scan", "scan"] + ); + } + + #[test] + fn test_split_into_chunks_with_skip_preserved() { + // Skip entries are carried by whichever chunk is currently being + // grown and never contribute to the row/byte budget, so here all + // three scan row groups fit together despite the wide skip in the + // middle. 
+ let meta = + row_groups_with_bytes(&[(30, 100), (1_000, 500), (30, 100), (30, 100)]); + let plan = ParquetAccessPlan::new(vec![ + RowGroupAccess::Scan, + RowGroupAccess::Skip, + RowGroupAccess::Scan, + RowGroupAccess::Scan, + ]); + let chunks = plan.split_into_chunks(&meta, 100, 1_000); + assert_eq!(chunks.len(), 1); + assert_eq!( + access_kinds(&chunks[0]), + vec!["scan", "skip", "scan", "scan"] + ); + } + + #[test] + fn test_split_into_chunks_skip_between_chunks() { + // When a chunk closes on budget, a following Skip is picked up by the + // next chunk rather than creating an empty one. + let meta = row_groups_with_bytes(&[(50, 100), (50, 100), (50, 100), (50, 100)]); + let plan = ParquetAccessPlan::new(vec![ + RowGroupAccess::Scan, + RowGroupAccess::Scan, + RowGroupAccess::Skip, + RowGroupAccess::Scan, + ]); + let chunks = plan.split_into_chunks(&meta, 100, 10_000); + assert_eq!(chunks.len(), 2); + assert_eq!( + access_kinds(&chunks[0]), + vec!["scan", "scan", "skip", "skip"] + ); + // rg2's Skip still lives in chunk 0 because chunk 0 was still open + // when we hit rg2; chunk 1 only covers rg3. + assert_eq!( + access_kinds(&chunks[1]), + vec!["skip", "skip", "skip", "scan"] + ); + } + + #[test] + fn test_split_into_chunks_preserves_selection() { + let meta = row_groups_with_bytes(&[(10, 100), (20, 100), (30, 100)]); + let selection: RowSelection = + vec![RowSelector::select(5), RowSelector::skip(15)].into(); + let plan = ParquetAccessPlan::new(vec![ + RowGroupAccess::Scan, + RowGroupAccess::Selection(selection), + RowGroupAccess::Scan, + ]); + // Budget forces each row group into its own chunk. + let chunks = plan.split_into_chunks(&meta, 15, 10_000); + assert_eq!(chunks.len(), 3); + assert_eq!(access_kinds(&chunks[0]), vec!["scan", "skip", "skip"]); + assert_eq!(access_kinds(&chunks[1]), vec!["skip", "sel", "skip"]); + assert_eq!(access_kinds(&chunks[2]), vec!["skip", "skip", "scan"]); + // The Selection must be preserved verbatim in its chunk. 
+ let RowGroupAccess::Selection(sel) = &chunks[1].inner()[1] else { + panic!("expected Selection preserved in chunk"); + }; + let selectors: Vec<_> = sel.clone().into(); + assert_eq!(selectors.len(), 2); + assert_eq!((selectors[0].skip, selectors[0].row_count), (false, 5)); + assert_eq!((selectors[1].skip, selectors[1].row_count), (true, 15)); + } + /// Single column schema with a single column named "a" of type `BYTE_ARRAY`/`String` fn get_test_schema_descr() -> SchemaDescPtr { use parquet::basic::Type as PhysicalType; diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index bad1c684b47f5..b548f1f2bde36 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -75,7 +75,16 @@ use parquet::arrow::parquet_column; use parquet::arrow::push_decoder::{ParquetPushDecoder, ParquetPushDecoderBuilder}; use parquet::basic::Type; use parquet::bloom_filter::Sbbf; -use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader}; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; + +/// Default soft upper bound on the number of rows packed into a single +/// row-group morsel. Adjacent row groups are coalesced until this limit would +/// be exceeded. A single oversized row group still becomes its own morsel. +pub(crate) const DEFAULT_MORSEL_MAX_ROWS: u64 = 100_000; + +/// Default soft upper bound on the compressed byte size of a single row-group +/// morsel. See [`DEFAULT_MORSEL_MAX_ROWS`]. +pub(crate) const DEFAULT_MORSEL_MAX_COMPRESSED_BYTES: u64 = 64 * 1024 * 1024; /// Stateless Parquet morselizer implementation. /// @@ -136,6 +145,15 @@ pub(super) struct ParquetMorselizer { pub max_predicate_cache_size: Option, /// Whether to read row groups in reverse order pub reverse_row_groups: bool, + /// Upper bound on the number of rows coalesced into a single morsel. 
+ /// + /// Row groups are packed greedily until the next row group would push + /// the total past this limit; a single oversized row group still becomes + /// its own morsel. + pub morsel_max_rows: u64, + /// Upper bound on the compressed byte size coalesced into a single + /// morsel. See [`Self::morsel_max_rows`]. + pub morsel_max_compressed_bytes: u64, } impl fmt::Debug for ParquetMorselizer { @@ -228,8 +246,15 @@ enum ParquetOpenState { /// /// TODO: split state as this currently does both I/O and CPU work. BuildStream(Box), - /// Terminal state: the final opened stream is ready to return. - Ready(BoxStream<'static, Result>), + /// Terminal state: one or more per-morsel lazy builders are ready to + /// return. + /// + /// Each morsel corresponds to one row-group-sized chunk of the file. + /// Morsels defer row-filter compilation, decoder construction, and + /// reader acquisition until [`Morsel::into_stream`] is actually + /// invoked — so construction work for a morsel only happens when the + /// scheduler picks it up. + Ready(Vec>), /// Terminal state: reading complete Done, } @@ -287,6 +312,8 @@ struct PreparedParquetOpen { max_predicate_cache_size: Option, reverse_row_groups: bool, preserve_order: bool, + morsel_max_rows: u64, + morsel_max_compressed_bytes: u64, #[cfg(feature = "parquet_encryption")] file_decryption_properties: Option>, } @@ -399,7 +426,7 @@ impl ParquetOpenState { ParquetOpenState::BuildStream(prepared) => { Ok(ParquetOpenState::Ready(prepared.build_stream()?)) } - ParquetOpenState::Ready(stream) => Ok(ParquetOpenState::Ready(stream)), + ParquetOpenState::Ready(streams) => Ok(ParquetOpenState::Ready(streams)), ParquetOpenState::Done => { panic!("ParquetOpenFuture polled after completion"); } @@ -407,27 +434,200 @@ impl ParquetOpenState { } } -/// Implements the Morsel API -struct ParquetStreamMorsel { - stream: BoxStream<'static, Result>, +/// File-level state shared across every lazy morsel from a single file open. 
+/// +/// Each [`ParquetLazyMorsel`] holds an `Arc` to one of these so the +/// expensive-to-clone pieces (metadata, schemas, metrics, Arc predicates) +/// are not duplicated. The only non-shareable resource is the +/// [`FilePruner`], which is held on chunk 0's morsel because it's +/// `!Clone`. +struct LazyMorselShared { + partition_index: usize, + partitioned_file: PartitionedFile, + metadata_size_hint: Option, + metrics: ExecutionPlanMetricsSet, + file_metrics: ParquetFileMetrics, + baseline_metrics: BaselineMetrics, + parquet_file_reader_factory: Arc, + batch_size: usize, + physical_file_schema: SchemaRef, + output_schema: SchemaRef, + projection: ProjectionExprs, + predicate: Option>, + pushdown_filters: bool, + force_filter_selections: bool, + reorder_predicates: bool, + limit: Option, + max_predicate_cache_size: Option, + reverse_row_groups: bool, + reader_metadata: ArrowReaderMetadata, + file_metadata: Arc, } -impl ParquetStreamMorsel { - fn new(stream: BoxStream<'static, Result>) -> Self { - Self { stream } - } +/// Lazy per-morsel builder. +/// +/// Holds everything needed to construct the parquet decoder stream for a +/// single chunk of row groups, but defers the actual construction — +/// `build_row_filter`, decoder build, reader acquisition — to +/// [`Morsel::into_stream`]. This means a file's morsel construction cost +/// is paid only as each morsel is scheduled, not all-at-once at +/// `build_stream` time. +struct ParquetLazyMorsel { + shared: Arc, + chunk_plan: ParquetAccessPlan, + chunk_idx: usize, + /// The file-level [`FilePruner`] used for dynamic-filter early-stop. + /// `FilePruner` is not `Clone` and holds stateful predicate-generation + /// counters, so it's attached only to chunk 0's stream. 
+ file_pruner: Option, } -impl fmt::Debug for ParquetStreamMorsel { +impl fmt::Debug for ParquetLazyMorsel { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ParquetStreamMorsel") + f.debug_struct("ParquetLazyMorsel") + .field("chunk_idx", &self.chunk_idx) .finish_non_exhaustive() } } -impl Morsel for ParquetStreamMorsel { +impl Morsel for ParquetLazyMorsel { fn into_stream(self: Box) -> BoxStream<'static, Result> { - self.stream + match (*self).build_stream_now() { + Ok(stream) => stream, + Err(e) => futures::stream::once(async move { Err(e) }).boxed(), + } + } +} + +impl ParquetLazyMorsel { + fn build_stream_now(self) -> Result>> { + let ParquetLazyMorsel { + shared, + chunk_plan, + chunk_idx, + file_pruner, + } = self; + + let rg_metadata = shared.file_metadata.row_groups(); + let mut prepared_plan = chunk_plan.prepare(rg_metadata)?; + if shared.reverse_row_groups { + prepared_plan = prepared_plan.reverse(shared.file_metadata.as_ref())?; + } + + // `RowFilter` is not `Clone` because it owns `Box`s, + // so a fresh filter has to be built per chunk. 
+ let row_filter = if let Some(predicate) = shared + .pushdown_filters + .then_some(shared.predicate.clone()) + .flatten() + { + match row_filter::build_row_filter( + &predicate, + &shared.physical_file_schema, + shared.file_metadata.as_ref(), + shared.reorder_predicates, + &shared.file_metrics, + ) { + Ok(Some(filter)) => Some(filter), + Ok(None) => None, + Err(e) => { + debug!("Ignoring error building row filter for '{predicate:?}': {e}"); + None + } + } + } else { + None + }; + + let arrow_reader_metrics = ArrowReaderMetrics::enabled(); + let read_plan = build_projection_read_plan( + shared.projection.expr_iter(), + &shared.physical_file_schema, + shared.reader_metadata.parquet_schema(), + ); + + let mut decoder_builder = + ParquetPushDecoderBuilder::new_with_metadata(shared.reader_metadata.clone()) + .with_projection(read_plan.projection_mask) + .with_batch_size(shared.batch_size) + .with_metrics(arrow_reader_metrics.clone()); + + if let Some(row_filter) = row_filter { + decoder_builder = decoder_builder.with_row_filter(row_filter); + } + if shared.force_filter_selections { + decoder_builder = + decoder_builder.with_row_selection_policy(RowSelectionPolicy::Selectors); + } + if let Some(row_selection) = prepared_plan.row_selection { + decoder_builder = decoder_builder.with_row_selection(row_selection); + } + decoder_builder = + decoder_builder.with_row_groups(prepared_plan.row_group_indexes); + // `ScanState.remain` enforces the true outer limit across all + // morsels; passing the per-chunk limit here is a conservative + // per-chunk cap that bounds wasted decode once the outer cap is hit. 
+ if let Some(limit) = shared.limit { + decoder_builder = decoder_builder.with_limit(limit); + } + if let Some(max_predicate_cache_size) = shared.max_predicate_cache_size { + decoder_builder = + decoder_builder.with_max_predicate_cache_size(max_predicate_cache_size); + } + + let decoder = decoder_builder.build()?; + + let reader = shared.parquet_file_reader_factory.create_reader( + shared.partition_index, + shared.partitioned_file.clone(), + shared.metadata_size_hint, + &shared.metrics, + )?; + + let stream_schema = read_plan.projected_schema; + let replace_schema = stream_schema != shared.output_schema; + let projection = shared + .projection + .clone() + .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; + let projector = projection.make_projector(&stream_schema)?; + + let predicate_cache_inner_records = + shared.file_metrics.predicate_cache_inner_records.clone(); + let predicate_cache_records = shared.file_metrics.predicate_cache_records.clone(); + + let stream = futures::stream::unfold( + PushDecoderStreamState { + decoder, + reader, + projector, + output_schema: Arc::clone(&shared.output_schema), + replace_schema, + arrow_reader_metrics, + predicate_cache_inner_records, + predicate_cache_records, + baseline_metrics: shared.baseline_metrics.clone(), + }, + |state| async move { state.transition().await }, + ) + .fuse(); + + // Attach `FilePruner` only to chunk 0 so the whole file scan can + // still early-stop when a dynamic filter narrows. 
+ let boxed: BoxStream<'static, Result> = if chunk_idx == 0 + && let Some(pruner) = file_pruner + { + EarlyStoppingStream::new( + stream.boxed(), + pruner, + shared.file_metrics.files_ranges_pruned_statistics.clone(), + ) + .boxed() + } else { + stream.boxed() + }; + + Ok(boxed) } } @@ -515,9 +715,12 @@ impl MorselPlanner for ParquetMorselPlanner { ))) }))) } - ParquetOpenState::Ready(stream) => { - let morsels: Vec> = - vec![Box::new(ParquetStreamMorsel::new(stream))]; + ParquetOpenState::Ready(morsels) => { + if morsels.is_empty() { + // No row groups survived pruning, so there's nothing to + // feed the executor — terminate this file's planner. + return Ok(None); + } Ok(Some(MorselPlan::new().with_morsels(morsels))) } ParquetOpenState::Done => Ok(None), @@ -656,6 +859,8 @@ impl ParquetMorselizer { max_predicate_cache_size: self.max_predicate_cache_size, reverse_row_groups: self.reverse_row_groups, preserve_order: self.preserve_order, + morsel_max_rows: self.morsel_max_rows, + morsel_max_compressed_bytes: self.morsel_max_compressed_bytes, #[cfg(feature = "parquet_encryption")] file_decryption_properties: None, }) @@ -1055,8 +1260,15 @@ impl BloomFiltersLoadedParquetOpen { } impl RowGroupsPrunedParquetOpen { - /// Build the final parquet stream once all pruning work is complete. - fn build_stream(self) -> Result>> { + /// Build one or more per-morsel streams once all pruning work is complete. + /// + /// Row groups are packed into chunks of up to [`MORSEL_MAX_ROWS`] rows and + /// [`MORSEL_MAX_COMPRESSED_BYTES`] compressed bytes. Each chunk becomes an + /// independent stream that can be wrapped in a `ParquetStreamMorsel`, + /// letting the driver interleave row-group work with other operators and + /// unblocking the follow-on work of sharing row-group-level work across + /// sibling `FileStream`s. 
+ fn build_stream(self) -> Result>> { let RowGroupsPrunedParquetOpen { prepared, mut row_groups, @@ -1075,32 +1287,6 @@ impl RowGroupsPrunedParquetOpen { let file_metadata = Arc::clone(reader_metadata.metadata()); let rg_metadata = file_metadata.row_groups(); - // Filter pushdown: evaluate predicates during scan - let row_filter = if let Some(predicate) = prepared - .pushdown_filters - .then_some(prepared.predicate.clone()) - .flatten() - { - let row_filter = row_filter::build_row_filter( - &predicate, - &prepared.physical_file_schema, - file_metadata.as_ref(), - prepared.reorder_predicates, - &prepared.file_metrics, - ); - - match row_filter { - Ok(Some(filter)) => Some(filter), - Ok(None) => None, - Err(e) => { - debug!("Ignoring error building row filter for '{predicate:?}': {e}"); - None - } - } - } else { - None - }; - // Prune by limit if limit is set and limit order is not sensitive if let (Some(limit), false) = (prepared.limit, prepared.preserve_order) { row_groups.prune_by_limit(limit, rg_metadata, &prepared.file_metrics); @@ -1123,98 +1309,72 @@ impl RowGroupsPrunedParquetOpen { ); } - // Prepare the access plan (extract row groups and row selection) - let mut prepared_plan = access_plan.prepare(rg_metadata)?; - - // Potentially reverse the access plan for performance. - // See `ParquetSource::try_pushdown_sort` for the rationale. 
- if prepared.reverse_row_groups { - prepared_plan = prepared_plan.reverse(file_metadata.as_ref())?; + if access_plan.row_group_index_iter().next().is_none() { + return Ok(Vec::new()); } - let arrow_reader_metrics = ArrowReaderMetrics::enabled(); - let read_plan = build_projection_read_plan( - prepared.projection.expr_iter(), - &prepared.physical_file_schema, - reader_metadata.parquet_schema(), + let mut chunk_plans = access_plan.split_into_chunks( + rg_metadata, + prepared.morsel_max_rows, + prepared.morsel_max_compressed_bytes, ); - let mut decoder_builder = - ParquetPushDecoderBuilder::new_with_metadata(reader_metadata) - .with_projection(read_plan.projection_mask) - .with_batch_size(prepared.batch_size) - .with_metrics(arrow_reader_metrics.clone()); - - if let Some(row_filter) = row_filter { - decoder_builder = decoder_builder.with_row_filter(row_filter); - } - if prepared.force_filter_selections { - decoder_builder = - decoder_builder.with_row_selection_policy(RowSelectionPolicy::Selectors); - } - if let Some(row_selection) = prepared_plan.row_selection { - decoder_builder = decoder_builder.with_row_selection(row_selection); - } - decoder_builder = - decoder_builder.with_row_groups(prepared_plan.row_group_indexes); - if let Some(limit) = prepared.limit { - decoder_builder = decoder_builder.with_limit(limit); - } - if let Some(max_predicate_cache_size) = prepared.max_predicate_cache_size { - decoder_builder = - decoder_builder.with_max_predicate_cache_size(max_predicate_cache_size); + // Reverse chunk order so that, when `reverse_row_groups` is set, the + // first emitted morsel corresponds to the file's last row groups. + // Each chunk's `PreparedAccessPlan` is also reversed below so that + // within a chunk the row-group read order mirrors the file-wide + // reversal. See `ParquetSource::try_pushdown_sort` for the rationale. 
+ if prepared.reverse_row_groups { + chunk_plans.reverse(); } - let decoder = decoder_builder.build()?; - - let predicate_cache_inner_records = - prepared.file_metrics.predicate_cache_inner_records.clone(); - let predicate_cache_records = - prepared.file_metrics.predicate_cache_records.clone(); - - // Check if we need to replace the schema to handle things like differing nullability or metadata. - // See note below about file vs. output schema. - let stream_schema = read_plan.projected_schema; - let replace_schema = stream_schema != prepared.output_schema; - - // Rebase column indices to match the narrowed stream schema. - // The projection expressions have indices based on physical_file_schema, - // but the stream only contains the columns selected by the ProjectionMask. - let projection = prepared - .projection - .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; - let projector = projection.make_projector(&stream_schema)?; - let output_schema = Arc::clone(&prepared.output_schema); - let files_ranges_pruned_statistics = - prepared.file_metrics.files_ranges_pruned_statistics.clone(); - let stream = futures::stream::unfold( - PushDecoderStreamState { - decoder, - reader: prepared.async_file_reader, - projector, - output_schema, - replace_schema, - arrow_reader_metrics, - predicate_cache_inner_records, - predicate_cache_records, - baseline_metrics: prepared.baseline_metrics, - }, - |state| async move { state.transition().await }, - ) - .fuse(); + // `prepared.async_file_reader` served metadata / page-index / + // bloom-filter loads and is dropped here: each morsel mints its + // own reader via the factory at `into_stream` time. Built-in + // factories wrap only `Arc` (HTTP/connection + // pool already shared) or an `Arc`, so the + // "warm cache" benefit of reusing a reader is negligible. 
+ let mut file_pruner = prepared.file_pruner; + + let shared = Arc::new(LazyMorselShared { + partition_index: prepared.partition_index, + partitioned_file: prepared.partitioned_file, + metadata_size_hint: prepared.metadata_size_hint, + metrics: prepared.metrics, + file_metrics: prepared.file_metrics, + baseline_metrics: prepared.baseline_metrics, + parquet_file_reader_factory: prepared.parquet_file_reader_factory, + batch_size: prepared.batch_size, + physical_file_schema: prepared.physical_file_schema, + output_schema: prepared.output_schema, + projection: prepared.projection, + predicate: prepared.predicate, + pushdown_filters: prepared.pushdown_filters, + force_filter_selections: prepared.force_filter_selections, + reorder_predicates: prepared.reorder_predicates, + limit: prepared.limit, + max_predicate_cache_size: prepared.max_predicate_cache_size, + reverse_row_groups: prepared.reverse_row_groups, + reader_metadata, + file_metadata, + }); + + // `FilePruner` is `!Clone`, so `take` hands it to the first morsel + // and leaves `None` for the rest. + let morsels: Vec> = chunk_plans + .into_iter() + .enumerate() + .map(|(chunk_idx, chunk_plan)| { + Box::new(ParquetLazyMorsel { + shared: Arc::clone(&shared), + chunk_plan, + chunk_idx, + file_pruner: file_pruner.take(), + }) as Box + }) + .collect(); - // Wrap the stream so a dynamic filter can stop the file scan early. 
- if let Some(file_pruner) = prepared.file_pruner { - let stream = stream.boxed(); - Ok(EarlyStoppingStream::new( - stream, - file_pruner, - files_ranges_pruned_statistics, - ) - .boxed()) - } else { - Ok(stream.boxed()) - } + Ok(morsels) } } @@ -1629,8 +1789,7 @@ mod test { use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use bytes::{BufMut, BytesMut}; use datafusion_common::{ - ColumnStatistics, ScalarValue, Statistics, internal_err, record_batch, - stats::Precision, + ColumnStatistics, ScalarValue, Statistics, record_batch, stats::Precision, }; use datafusion_datasource::morsel::{Morsel, Morselizer}; use datafusion_datasource::{PartitionedFile, TableSchema}; @@ -1676,6 +1835,8 @@ mod test { max_predicate_cache_size: Option, reverse_row_groups: bool, preserve_order: bool, + morsel_max_rows: u64, + morsel_max_compressed_bytes: u64, } impl ParquetMorselizerBuilder { @@ -1702,6 +1863,8 @@ mod test { max_predicate_cache_size: None, reverse_row_groups: false, preserve_order: false, + morsel_max_rows: DEFAULT_MORSEL_MAX_ROWS, + morsel_max_compressed_bytes: DEFAULT_MORSEL_MAX_COMPRESSED_BYTES, } } @@ -1765,6 +1928,19 @@ mod test { self } + /// Override the per-morsel row budget. + fn with_morsel_max_rows(mut self, limit: u64) -> Self { + self.morsel_max_rows = limit; + self + } + + /// Override the per-morsel compressed byte budget. + #[expect(dead_code)] + fn with_morsel_max_compressed_bytes(mut self, limit: u64) -> Self { + self.morsel_max_compressed_bytes = limit; + self + } + /// Build the ParquetMorselizer instance. 
/// /// # Panics @@ -1816,6 +1992,8 @@ mod test { encryption_factory: None, max_predicate_cache_size: self.max_predicate_cache_size, reverse_row_groups: self.reverse_row_groups, + morsel_max_rows: self.morsel_max_rows, + morsel_max_compressed_bytes: self.morsel_max_compressed_bytes, } } } @@ -1830,32 +2008,49 @@ mod test { morselizer: &ParquetMorselizer, file: PartitionedFile, ) -> Result>> { - let mut planners = VecDeque::from([morselizer.plan_file(file)?]); - let mut morsels: VecDeque> = VecDeque::new(); + let morsels = collect_all_morsels(morselizer, file).await?; + if let Some(first) = morsels.into_iter().next() { + Ok(Box::pin(first.into_stream())) + } else { + Ok(Box::pin(futures::stream::empty())) + } + } - loop { - if let Some(morsel) = morsels.pop_front() { - return Ok(Box::pin(morsel.into_stream())); - } + /// Drives the morselizer to completion and returns every morsel it + /// produced, in order. Useful for asserting how a file is split into + /// row-group morsels. + async fn collect_all_morsels( + morselizer: &ParquetMorselizer, + file: PartitionedFile, + ) -> Result>> { + let mut planners = VecDeque::from([morselizer.plan_file(file)?]); + let mut morsels: Vec> = Vec::new(); - let Some(planner) = planners.pop_front() else { - return Ok(Box::pin(futures::stream::empty())); + while let Some(planner) = planners.pop_front() { + let Some(mut plan) = planner.plan()? else { + continue; }; + morsels.extend(plan.take_morsels()); + planners.extend(plan.take_ready_planners()); - if let Some(mut plan) = planner.plan()? 
{ - morsels.extend(plan.take_morsels()); - planners.extend(plan.take_ready_planners()); + if let Some(pending_planner) = plan.take_pending_planner() { + planners.push_front(pending_planner.await?); + } + } - if let Some(pending_planner) = plan.take_pending_planner() { - planners.push_front(pending_planner.await?); - continue; - } + Ok(morsels) + } - if morsels.is_empty() && planners.is_empty() { - return internal_err!("planner returned an empty morsel plan"); - } - } + /// Concatenate all batches produced by `streams`, returning the int32 + /// values from the first column of each batch. + async fn collect_int32_values_across( + streams: Vec>>, + ) -> Vec { + let mut values = vec![]; + for stream in streams { + values.extend(collect_int32_values(stream).await); } + values } fn constant_int_stats() -> (Statistics, SchemaRef) { @@ -2651,6 +2846,216 @@ mod test { ); } + /// A multi-row-group file whose pruned access plan exceeds the per-morsel + /// row budget produces multiple morsels, and their concatenated output + /// matches the single-morsel reference. + #[tokio::test] + async fn test_row_group_split_produces_multiple_morsels() { + use parquet::file::properties::WriterProperties; + + let store = Arc::new(InMemory::new()) as Arc; + + // Three row groups of 3 rows each. Packing stops at 3 rows/morsel, so + // we expect three morsels. 
+ let batch1 = + record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap(); + let batch2 = + record_batch!(("a", Int32, vec![Some(4), Some(5), Some(6)])).unwrap(); + let batch3 = + record_batch!(("a", Int32, vec![Some(7), Some(8), Some(9)])).unwrap(); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(3)) + .build(); + let data_len = write_parquet_batches( + Arc::clone(&store), + "test.parquet", + vec![batch1.clone(), batch2, batch3], + Some(props), + ) + .await; + let schema = batch1.schema(); + let file = PartitionedFile::new( + "test.parquet".to_string(), + u64::try_from(data_len).unwrap(), + ); + + let morselizer = ParquetMorselizerBuilder::new() + .with_store(Arc::clone(&store)) + .with_schema(Arc::clone(&schema)) + .with_projection_indices(&[0]) + .with_morsel_max_rows(3) + .build(); + let morsels = collect_all_morsels(&morselizer, file.clone()) + .await + .unwrap(); + assert_eq!(morsels.len(), 3, "one morsel per row group"); + + let streams = morsels + .into_iter() + .map(|m| Box::pin(m.into_stream()) as BoxStream<_>) + .collect(); + let values = collect_int32_values_across(streams).await; + assert_eq!(values, vec![1, 2, 3, 4, 5, 6, 7, 8, 9]); + + // Reference: default budget keeps everything in one morsel. + let reference_morselizer = ParquetMorselizerBuilder::new() + .with_store(Arc::clone(&store)) + .with_schema(schema) + .with_projection_indices(&[0]) + .build(); + let reference_stream = open_file(&reference_morselizer, file).await.unwrap(); + assert_eq!( + collect_int32_values(reference_stream).await, + vec![1, 2, 3, 4, 5, 6, 7, 8, 9] + ); + } + + /// When adjacent row groups fit inside the morsel budget they should be + /// packed together rather than emitted one-per-morsel. 
+ #[tokio::test] + async fn test_row_group_split_packs_within_budget() { + use parquet::file::properties::WriterProperties; + + let store = Arc::new(InMemory::new()) as Arc; + + let batch1 = + record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap(); + let batch2 = + record_batch!(("a", Int32, vec![Some(4), Some(5), Some(6)])).unwrap(); + let batch3 = + record_batch!(("a", Int32, vec![Some(7), Some(8), Some(9)])).unwrap(); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(3)) + .build(); + let data_len = write_parquet_batches( + Arc::clone(&store), + "test.parquet", + vec![batch1.clone(), batch2, batch3], + Some(props), + ) + .await; + let schema = batch1.schema(); + let file = PartitionedFile::new( + "test.parquet".to_string(), + u64::try_from(data_len).unwrap(), + ); + + // Budget fits exactly 2 row groups; expect two morsels: [0+1], [2]. + let morselizer = ParquetMorselizerBuilder::new() + .with_store(store) + .with_schema(schema) + .with_projection_indices(&[0]) + .with_morsel_max_rows(6) + .build(); + let morsels = collect_all_morsels(&morselizer, file).await.unwrap(); + assert_eq!(morsels.len(), 2); + } + + /// A user-supplied access plan with a `Skip` entry between scanned row + /// groups should preserve the skip across chunking. 
+ #[tokio::test] + async fn test_row_group_split_honors_user_skip() { + use crate::ParquetAccessPlan; + use parquet::file::properties::WriterProperties; + + let store = Arc::new(InMemory::new()) as Arc; + + let batch1 = + record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap(); + let batch2 = + record_batch!(("a", Int32, vec![Some(4), Some(5), Some(6)])).unwrap(); + let batch3 = + record_batch!(("a", Int32, vec![Some(7), Some(8), Some(9)])).unwrap(); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(3)) + .build(); + let data_len = write_parquet_batches( + Arc::clone(&store), + "test.parquet", + vec![batch1.clone(), batch2, batch3], + Some(props), + ) + .await; + let schema = batch1.schema(); + + let mut access_plan = ParquetAccessPlan::new_all(3); + access_plan.skip(1); + + let file = PartitionedFile::new( + "test.parquet".to_string(), + u64::try_from(data_len).unwrap(), + ) + .with_extensions(Arc::new(access_plan)); + + let morselizer = ParquetMorselizerBuilder::new() + .with_store(store) + .with_schema(schema) + .with_projection_indices(&[0]) + .with_morsel_max_rows(3) + .build(); + let morsels = collect_all_morsels(&morselizer, file).await.unwrap(); + let streams = morsels + .into_iter() + .map(|m| Box::pin(m.into_stream()) as BoxStream<_>) + .collect(); + let values = collect_int32_values_across(streams).await; + assert_eq!(values, vec![1, 2, 3, 7, 8, 9], "row group 1 is skipped"); + } + + /// When `reverse_row_groups` is set the per-morsel split should preserve + /// the reverse output order: the first morsel emits the file's last row + /// group. 
+ #[tokio::test] + async fn test_row_group_split_with_reverse() { + use parquet::file::properties::WriterProperties; + + let store = Arc::new(InMemory::new()) as Arc; + + let batch1 = + record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap(); + let batch2 = + record_batch!(("a", Int32, vec![Some(4), Some(5), Some(6)])).unwrap(); + let batch3 = + record_batch!(("a", Int32, vec![Some(7), Some(8), Some(9)])).unwrap(); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(3)) + .build(); + let data_len = write_parquet_batches( + Arc::clone(&store), + "test.parquet", + vec![batch1.clone(), batch2, batch3], + Some(props), + ) + .await; + let schema = batch1.schema(); + let file = PartitionedFile::new( + "test.parquet".to_string(), + u64::try_from(data_len).unwrap(), + ); + + let morselizer = ParquetMorselizerBuilder::new() + .with_store(store) + .with_schema(schema) + .with_projection_indices(&[0]) + .with_morsel_max_rows(3) + .with_reverse_row_groups(true) + .build(); + let morsels = collect_all_morsels(&morselizer, file).await.unwrap(); + assert_eq!(morsels.len(), 3); + + // First morsel should emit the originally-last row group. + let streams: Vec<_> = morsels + .into_iter() + .map(|m| Box::pin(m.into_stream()) as BoxStream<_>) + .collect(); + let values = collect_int32_values_across(streams).await; + assert_eq!(values, vec![7, 8, 9, 4, 5, 6, 1, 2, 3]); + } + /// Test that page pruning predicates are only built and applied when `enable_page_index` is true. /// /// The file has a single row group with 10 pages (10 rows each, values 1..100). 
diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index a014c8b2726e7..0d0840655bf26 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -580,6 +580,9 @@ impl FileSource for ParquetSource { encryption_factory: self.get_encryption_factory_with_config(), max_predicate_cache_size: self.max_predicate_cache_size(), reverse_row_groups: self.reverse_row_groups, + morsel_max_rows: crate::opener::DEFAULT_MORSEL_MAX_ROWS, + morsel_max_compressed_bytes: + crate::opener::DEFAULT_MORSEL_MAX_COMPRESSED_BYTES, })) } diff --git a/datafusion/execution/src/memory_pool/mod.rs b/datafusion/execution/src/memory_pool/mod.rs index a544cdfdb02e8..829e313d2381e 100644 --- a/datafusion/execution/src/memory_pool/mod.rs +++ b/datafusion/execution/src/memory_pool/mod.rs @@ -19,6 +19,7 @@ //! help with allocation accounting. use datafusion_common::{Result, internal_datafusion_err}; +use std::fmt::Display; use std::hash::{Hash, Hasher}; use std::{cmp::Ordering, sync::Arc, sync::atomic}; @@ -181,7 +182,10 @@ pub use pool::*; /// /// * [`TrackConsumersPool`]: Wraps another [`MemoryPool`] and tracks consumers, /// providing better error messages on the largest memory users. -pub trait MemoryPool: Send + Sync + std::fmt::Debug { +pub trait MemoryPool: Send + Sync + std::fmt::Debug + Display { + /// Return pool name + fn name(&self) -> &str; + /// Registers a new [`MemoryConsumer`] /// /// Note: Subsequent calls to [`Self::grow`] must be made to reserve memory @@ -232,7 +236,7 @@ pub enum MemoryLimit { /// [`MemoryReservation`] in a [`MemoryPool`]. 
All allocations are registered to /// a particular `MemoryConsumer`; /// -/// Each `MemoryConsumer` is identifiable by a process-unique id, and is therefor not cloneable, +/// Each `MemoryConsumer` is identifiable by a process-unique id, and is therefore not cloneable, /// If you want a clone of a `MemoryConsumer`, you should look into [`MemoryConsumer::clone_with_new_id`], /// but note that this `MemoryConsumer` may be treated as a separate entity based on the used pool, /// and is only guaranteed to share the name and inner properties. diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index 19aaa0371ada3..aac95b9d6a81f 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -22,6 +22,7 @@ use datafusion_common::HashMap; use datafusion_common::{DataFusionError, Result, resources_datafusion_err}; use log::debug; use parking_lot::Mutex; +use std::fmt::{Display, Formatter}; use std::{ num::NonZeroUsize, sync::atomic::{AtomicUsize, Ordering}, @@ -34,6 +35,10 @@ pub struct UnboundedMemoryPool { } impl MemoryPool for UnboundedMemoryPool { + fn name(&self) -> &str { + "unbounded" + } + fn grow(&self, _reservation: &MemoryReservation, additional: usize) { self.used.fetch_add(additional, Ordering::Relaxed); } @@ -56,6 +61,13 @@ impl MemoryPool for UnboundedMemoryPool { } } +impl Display for UnboundedMemoryPool { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let used = self.used.load(Ordering::Relaxed); + write!(f, "{}(used: {})", &self.name(), human_readable_size(used)) + } +} + /// A [`MemoryPool`] that implements a greedy first-come first-serve limit. 
/// /// This pool works well for queries that do not need to spill or have @@ -79,6 +91,10 @@ impl GreedyMemoryPool { } impl MemoryPool for GreedyMemoryPool { + fn name(&self) -> &str { + "greedy" + } + fn grow(&self, _reservation: &MemoryReservation, additional: usize) { self.used.fetch_add(additional, Ordering::Relaxed); } @@ -98,6 +114,7 @@ impl MemoryPool for GreedyMemoryPool { reservation, additional, self.pool_size.saturating_sub(used), + self, ) })?; Ok(()) @@ -112,6 +129,19 @@ impl MemoryPool for GreedyMemoryPool { } } +impl Display for GreedyMemoryPool { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let used = self.used.load(Ordering::Relaxed); + write!( + f, + "{}(used: {}, pool_size: {})", + &self.name(), + human_readable_size(used), + human_readable_size(self.pool_size) + ) + } +} + /// A [`MemoryPool`] that prevents spillable reservations from using more than /// an even fraction of the available memory sans any unspillable reservations /// (i.e. `(pool_size - unspillable_memory) / num_spillable_reservations`) @@ -170,6 +200,10 @@ impl FairSpillPool { } impl MemoryPool for FairSpillPool { + fn name(&self) -> &str { + "fair" + } + fn register(&self, consumer: &MemoryConsumer) { if consumer.can_spill { self.state.lock().num_spill += 1; @@ -217,6 +251,7 @@ impl MemoryPool for FairSpillPool { reservation, additional, available, + self, )); } state.spillable += additional; @@ -231,6 +266,7 @@ impl MemoryPool for FairSpillPool { reservation, additional, available, + self, )); } state.unspillable += additional; @@ -249,6 +285,17 @@ impl MemoryPool for FairSpillPool { } } +impl Display for FairSpillPool { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}(pool_size: {})", + &self.name(), + human_readable_size(self.pool_size), + ) + } +} + /// Constructs a resources error based upon the individual [`MemoryReservation`]. 
/// /// The error references the `bytes already allocated` for the reservation, @@ -259,13 +306,15 @@ fn insufficient_capacity_err( reservation: &MemoryReservation, additional: usize, available: usize, + pool: &impl MemoryPool, ) -> DataFusionError { resources_datafusion_err!( - "Failed to allocate additional {} for {} with {} already allocated for this reservation - {} remain available for the total pool", + "Failed to allocate additional {} for {} with {} already allocated for this reservation - {} remain available for the total memory pool: {}", human_readable_size(additional), reservation.registration.consumer.name, human_readable_size(reservation.size()), - human_readable_size(available) + human_readable_size(available), + pool ) } @@ -362,6 +411,18 @@ pub struct TrackConsumersPool { tracked_consumers: Mutex>, } +impl Display for TrackConsumersPool { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}(inner_pool: {}, num_of_top_consumers: {})", + &self.name(), + &self.inner, + &self.top, + ) + } +} + impl TrackConsumersPool { /// Creates a new [`TrackConsumersPool`]. /// @@ -407,6 +468,11 @@ impl TrackConsumersPool { } } + /// Returns a reference to the wrapped inner [`MemoryPool`]. + pub fn inner(&self) -> &I { + &self.inner + } + /// Returns a snapshot of all currently tracked consumers. 
pub fn metrics(&self) -> Vec { self.tracked_consumers @@ -452,6 +518,10 @@ impl TrackConsumersPool { } impl MemoryPool for TrackConsumersPool { + fn name(&self) -> &str { + "track_consumers" + } + fn register(&self, consumer: &MemoryConsumer) { self.inner.register(consumer); @@ -545,7 +615,7 @@ fn provide_top_memory_consumers_to_error_msg( #[cfg(test)] mod tests { use super::*; - use insta::{Settings, allow_duplicates, assert_snapshot}; + use insta::{Settings, allow_duplicates, assert_snapshot, with_settings}; use std::sync::Arc; fn make_settings() -> Settings { @@ -575,10 +645,10 @@ mod tests { assert_eq!(pool.reserved(), 4000); let err = r2.try_grow(1).unwrap_err().strip_backtrace(); - assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 1.0 B for r2 with 2000.0 B already allocated for this reservation - 0.0 B remain available for the total pool"); + assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 1.0 B for r2 with 2000.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: fair(pool_size: 100.0 B)"); let err = r2.try_grow(1).unwrap_err().strip_backtrace(); - assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 1.0 B for r2 with 2000.0 B already allocated for this reservation - 0.0 B remain available for the total pool"); + assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 1.0 B for r2 with 2000.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: fair(pool_size: 100.0 B)"); r1.shrink(1990); r2.shrink(2000); @@ -603,12 +673,12 @@ mod tests { .register(&pool); let err = r3.try_grow(70).unwrap_err().strip_backtrace(); - assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 70.0 B for r3 with 0.0 B already allocated for this reservation - 40.0 B remain available for the total pool"); + assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 70.0 B for 
r3 with 0.0 B already allocated for this reservation - 40.0 B remain available for the total memory pool: fair(pool_size: 100.0 B)"); //Shrinking r2 to zero doesn't allow a3 to allocate more than 45 r2.free(); let err = r3.try_grow(70).unwrap_err().strip_backtrace(); - assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 70.0 B for r3 with 0.0 B already allocated for this reservation - 40.0 B remain available for the total pool"); + assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 70.0 B for r3 with 0.0 B already allocated for this reservation - 40.0 B remain available for the total memory pool: fair(pool_size: 100.0 B)"); // But dropping r2 does drop(r2); @@ -621,7 +691,7 @@ mod tests { let r4 = MemoryConsumer::new("s4").register(&pool); let err = r4.try_grow(30).unwrap_err().strip_backtrace(); - assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 30.0 B for s4 with 0.0 B already allocated for this reservation - 20.0 B remain available for the total pool"); + assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 30.0 B for s4 with 0.0 B already allocated for this reservation - 20.0 B remain available for the total memory pool: fair(pool_size: 100.0 B)"); } #[test] @@ -669,7 +739,7 @@ mod tests { r1#[ID](can spill: false) consumed 50.0 B, peak 70.0 B, r3#[ID](can spill: false) consumed 20.0 B, peak 25.0 B, r2#[ID](can spill: false) consumed 15.0 B, peak 15.0 B. 
- Error: Failed to allocate additional 150.0 B for r5 with 0.0 B already allocated for this reservation - 5.0 B remain available for the total pool + Error: Failed to allocate additional 150.0 B for r5 with 0.0 B already allocated for this reservation - 5.0 B remain available for the total memory pool: greedy(used: 95.0 B, pool_size: 100.0 B) "); } @@ -692,7 +762,7 @@ mod tests { assert_snapshot!(error, @r" Resources exhausted: Additional allocation failed for foo with top memory consumers (across reservations) as: foo#[ID](can spill: false) consumed 0.0 B, peak 0.0 B. - Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 100.0 B remain available for the total pool + Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 100.0 B remain available for the total memory pool: greedy(used: 0.0 B, pool_size: 100.0 B) "); // API: multiple registrations using the same hashed consumer, @@ -710,7 +780,7 @@ mod tests { Resources exhausted: Additional allocation failed for foo with top memory consumers (across reservations) as: foo#[ID](can spill: false) consumed 10.0 B, peak 10.0 B, foo#[ID](can spill: false) consumed 0.0 B, peak 0.0 B. - Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 90.0 B remain available for the total pool + Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 90.0 B remain available for the total memory pool: greedy(used: 10.0 B, pool_size: 100.0 B) "); // Test: will accumulate size changes per consumer, not per reservation @@ -723,7 +793,7 @@ mod tests { Resources exhausted: Additional allocation failed for foo with top memory consumers (across reservations) as: foo#[ID](can spill: false) consumed 20.0 B, peak 20.0 B, foo#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. 
- Error: Failed to allocate additional 150.0 B for foo with 20.0 B already allocated for this reservation - 70.0 B remain available for the total pool + Error: Failed to allocate additional 150.0 B for foo with 20.0 B already allocated for this reservation - 70.0 B remain available for the total memory pool: greedy(used: 30.0 B, pool_size: 100.0 B) "); // Test: different hashed consumer, (even with the same name), @@ -739,78 +809,86 @@ mod tests { foo#[ID](can spill: false) consumed 20.0 B, peak 20.0 B, foo#[ID](can spill: false) consumed 10.0 B, peak 10.0 B, foo#[ID](can spill: true) consumed 0.0 B, peak 0.0 B. - Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 70.0 B remain available for the total pool + Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 70.0 B remain available for the total memory pool: greedy(used: 30.0 B, pool_size: 100.0 B) "); } #[test] fn test_tracked_consumers_pool_deregister() { - fn test_per_pool_type(pool: Arc) { - // Baseline: see the 2 memory consumers - let setting = make_settings(); - let _bound = setting.bind_to_scope(); - let r0 = MemoryConsumer::new("r0").register(&pool); - r0.grow(10); - let r1_consumer = MemoryConsumer::new("r1"); - let r1 = r1_consumer.register(&pool); - r1.grow(20); - - let res = r0.try_grow(150); - assert!(res.is_err()); - let error = res.unwrap_err().strip_backtrace(); - allow_duplicates!(assert_snapshot!(error, @r" - Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: - r1#[ID](can spill: false) consumed 20.0 B, peak 20.0 B, - r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. 
- Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 70.0 B remain available for the total pool - ")); - - // Test: unregister one - // only the remaining one should be listed - drop(r1); - let res = r0.try_grow(150); - assert!(res.is_err()); - let error = res.unwrap_err().strip_backtrace(); - allow_duplicates!(assert_snapshot!(error, @r" - Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: - r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. - Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool - ")); - - // Test: actual message we see is the `available is 70`. When it should be `available is 90`. - // This is because the pool.shrink() does not automatically occur within the inner_pool.deregister(). - let res = r0.try_grow(150); - assert!(res.is_err()); - let error = res.unwrap_err().strip_backtrace(); - allow_duplicates!(assert_snapshot!(error, @r" - Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: - r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. - Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool - ")); - - // Test: the registration needs to free itself (or be dropped), - // for the proper error message - let res = r0.try_grow(150); - assert!(res.is_err()); - let error = res.unwrap_err().strip_backtrace(); - allow_duplicates!(assert_snapshot!(error, @r" - Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: - r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. 
- Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool - ")); + fn test_per_pool_type(pool: Arc>) { + // `snapshot_suffix` ties each insta snapshot to this pool's inner backend; filters + // normalize inner pool `Display` so fair vs greedy share the same `@` reference text. + with_settings!({ + snapshot_suffix => pool.inner().name().to_string(), + filters => vec![ + ( + r"([^\s]+)\#\d+\(can spill: (true|false)\)", + "$1#[ID](can spill: $2)", + ), + ( + r"for the total memory pool: [^\n]+", + "for the total memory pool: [INNER_POOL]", + ), + ], + }, { + let memory_pool: Arc = Arc::>::clone(&pool); + let r0 = MemoryConsumer::new("r0").register(&memory_pool); + r0.grow(10); + let r1 = MemoryConsumer::new("r1").register(&memory_pool); + r1.grow(20); + + // Baseline: see the 2 memory consumers + let error = r0.try_grow(150).unwrap_err().strip_backtrace(); + assert_snapshot!(error, @r" + Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: + r1#[ID](can spill: false) consumed 20.0 B, peak 20.0 B, + r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. + Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 70.0 B remain available for the total memory pool: [INNER_POOL] + "); + + // Test: unregister one — only the remaining consumer should be listed + drop(r1); + let error = r0.try_grow(150).unwrap_err().strip_backtrace(); + assert_snapshot!(error, @r" + Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: + r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. + Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total memory pool: [INNER_POOL] + "); + + // Test: actual message we see is the `available is 70`. 
When it should be `available is 90`. + // This is because the pool.shrink() does not automatically occur within the inner_pool.deregister(). + let error = r0.try_grow(150).unwrap_err().strip_backtrace(); + assert_snapshot!(error, @r" + Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: + r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. + Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total memory pool: [INNER_POOL] + "); + + // Test: the registration needs to free itself (or be dropped), + // for the proper error message + let error = r0.try_grow(150).unwrap_err().strip_backtrace(); + assert_snapshot!(error, @r" + Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as: + r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B. + Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total memory pool: [INNER_POOL] + "); + } + ); } - let tracked_spill_pool: Arc = Arc::new(TrackConsumersPool::new( - FairSpillPool::new(100), - NonZeroUsize::new(3).unwrap(), - )); - test_per_pool_type(tracked_spill_pool); + allow_duplicates! { + let tracked_spill_pool = Arc::new(TrackConsumersPool::new( + FairSpillPool::new(100), + NonZeroUsize::new(3).unwrap(), + )); + test_per_pool_type(tracked_spill_pool); - let tracked_greedy_pool: Arc = Arc::new(TrackConsumersPool::new( - GreedyMemoryPool::new(100), - NonZeroUsize::new(3).unwrap(), - )); - test_per_pool_type(tracked_greedy_pool); + let tracked_greedy_pool = Arc::new(TrackConsumersPool::new( + GreedyMemoryPool::new(100), + NonZeroUsize::new(3).unwrap(), + )); + test_per_pool_type(tracked_greedy_pool); + } } #[test] @@ -894,4 +972,78 @@ mod tests { r1#[ID](can spill: false) consumed 20.0 B, peak 20.0 B. 
"); } + + #[test] + fn test_memory_pool_display_fmt() { + let top = NonZeroUsize::new(5).unwrap(); + + // UnboundedMemoryPool Display with default allocation: 0.0B + let unbounded = UnboundedMemoryPool::default(); + assert_eq!( + unbounded.to_string(), + "unbounded(used: 0.0 B)", + "UnboundedMemoryPool Display" + ); + + // UnboundedMemoryPool Display with reservations + let unbounded_arc: Arc<dyn MemoryPool> = Arc::new(UnboundedMemoryPool::default()); + let r = MemoryConsumer::new("u").register(&unbounded_arc); + r.grow(2048); + assert_eq!( + unbounded_arc.as_ref().to_string(), + "unbounded(used: 2.0 KB)", + "UnboundedMemoryPool Display with reservations" + ); + + // GreedyMemoryPool Display with default allocation: 100.0B + let greedy = GreedyMemoryPool::new(100); + assert_eq!( + greedy.to_string(), + "greedy(used: 0.0 B, pool_size: 100.0 B)", + "GreedyMemoryPool Display" + ); + + // GreedyMemoryPool Display with reservations + let greedy_arc: Arc<dyn MemoryPool> = Arc::new(GreedyMemoryPool::new(100)); + let r = MemoryConsumer::new("g").register(&greedy_arc); + r.grow(50); + assert_eq!( + greedy_arc.as_ref().to_string(), + "greedy(used: 50.0 B, pool_size: 100.0 B)", + "GreedyMemoryPool Display with reservations" + ); + + // FairSpillPool Display with default allocation: 4.0KB and without reservations + let fair = FairSpillPool::new(4096); + assert_eq!( + fair.to_string(), + "fair(pool_size: 4.0 KB)", + "FairSpillPool Display" + ); + + // TrackConsumersPool Display with default allocation: 128.0B and without reservations + let tracked_greedy = TrackConsumersPool::new(GreedyMemoryPool::new(128), top); + assert_eq!( + tracked_greedy.to_string(), + "track_consumers(inner_pool: greedy(used: 0.0 B, pool_size: 128.0 B), num_of_top_consumers: 5)", + "TrackConsumersPool Display" + ); + + // TrackConsumersPool Display with default allocation: 256.0B and without reservations + let tracked_fair = TrackConsumersPool::new(FairSpillPool::new(256), top); + assert_eq!( + tracked_fair.to_string(), + 
"track_consumers(inner_pool: fair(pool_size: 256.0 B), num_of_top_consumers: 5)", + "TrackConsumersPool Display" + ); + + // TrackConsumersPool Display without reservations + let tracked_unbounded = + TrackConsumersPool::new(UnboundedMemoryPool::default(), top); + assert_eq!( + tracked_unbounded.to_string(), + "track_consumers(inner_pool: unbounded(used: 0.0 B), num_of_top_consumers: 5)", + "TrackConsumersPool Display" + ); + } } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 4f73169ad2827..d86024295a061 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -294,9 +294,12 @@ pub enum LogicalPlan { impl Default for LogicalPlan { fn default() -> Self { + // `Default` is used as a transient placeholder on hot paths (e.g. + // `Box`/`Arc` `map_elements`), so use a shared empty schema to avoid + // allocating. LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new(DFSchema::empty()), + schema: Arc::clone(DFSchema::empty_ref()), }) } } diff --git a/datafusion/expr/src/logical_plan/statement.rs b/datafusion/expr/src/logical_plan/statement.rs index 384d99ca0899e..daf29d7c81d3f 100644 --- a/datafusion/expr/src/logical_plan/statement.rs +++ b/datafusion/expr/src/logical_plan/statement.rs @@ -20,7 +20,7 @@ use datafusion_common::metadata::format_type_and_metadata; use datafusion_common::{DFSchema, DFSchemaRef}; use itertools::Itertools as _; use std::fmt::{self, Display}; -use std::sync::{Arc, LazyLock}; +use std::sync::Arc; use crate::{Expr, LogicalPlan, expr_vec_fmt}; @@ -55,10 +55,7 @@ impl Statement { /// Get a reference to the logical plan's schema pub fn schema(&self) -> &DFSchemaRef { // Statements have an unchanging empty schema. 
- static STATEMENT_EMPTY_SCHEMA: LazyLock = - LazyLock::new(|| Arc::new(DFSchema::empty())); - - &STATEMENT_EMPTY_SCHEMA + DFSchema::empty_ref() } /// Return a descriptive string describing the type of this diff --git a/datafusion/expr/src/tree_node.rs b/datafusion/expr/src/tree_node.rs index f3bec6bbf9954..f43b138a284ea 100644 --- a/datafusion/expr/src/tree_node.rs +++ b/datafusion/expr/src/tree_node.rs @@ -116,7 +116,7 @@ impl TreeNode for Expr { /// indicating whether the expression was transformed or left unchanged. fn map_children Result>>( self, - mut f: F, + f: F, ) -> Result> { Ok(match self { // TODO: remove the next line after `Expr::Wildcard` is removed @@ -150,8 +150,13 @@ impl TreeNode for Expr { relation, name, metadata, - }) => f(*expr)?.update_data(|e| { - e.alias_qualified_with_metadata(relation, name, metadata) + }) => expr.map_elements(f)?.update_data(|expr| { + Expr::Alias(Alias { + expr, + relation, + name, + metadata, + }) }), Expr::InSubquery(InSubquery { expr, diff --git a/datafusion/functions-nested/src/concat.rs b/datafusion/functions-nested/src/concat.rs index c3dc4c67cf12c..8d06140889a55 100644 --- a/datafusion/functions-nested/src/concat.rs +++ b/datafusion/functions-nested/src/concat.rs @@ -317,10 +317,23 @@ impl ScalarUDFImpl for ArrayConcat { } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - let base_type = base_type(&self.return_type(arg_types)?); + let return_type = self.return_type(arg_types)?; + let base_type = base_type(&return_type); let coercion = Some(&ListCoercion::FixedSizedListToList); + // When the return type is a `LargeList`, the outer container of every + // input must be widened to `LargeList` as well. Otherwise + // `array_concat_inner` would later try to downcast a `List` argument + // to `GenericListArray` and fail. 
+ let promote_to_large_list = matches!(return_type, DataType::LargeList(_)); let arg_types = arg_types.iter().map(|arg_type| { - coerced_type_with_base_type_only(arg_type, &base_type, coercion) + let coerced = + coerced_type_with_base_type_only(arg_type, &base_type, coercion); + match coerced { + DataType::List(field) if promote_to_large_list => { + DataType::LargeList(field) + } + other => other, + } }); Ok(arg_types.collect()) diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index a5ac8901635b7..90cab7246d71c 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -531,35 +531,48 @@ fn estimate_join_cardinality( }) } - // For SemiJoins estimation result is either zero, in cases when inputs - // are non-overlapping according to statistics, or equal to number of rows - // for outer input - JoinType::LeftSemi | JoinType::RightSemi => { - let (outer_stats, inner_stats) = match join_type { - JoinType::LeftSemi => (left_stats, right_stats), - _ => (right_stats, left_stats), - }; - let cardinality = match estimate_disjoint_inputs(&outer_stats, &inner_stats) { - Some(estimation) => *estimation.get_value()?, - None => *outer_stats.num_rows.get_value()?, - }; + JoinType::LeftSemi + | JoinType::RightSemi + | JoinType::LeftAnti + | JoinType::RightAnti => { + let is_left = matches!(join_type, JoinType::LeftSemi | JoinType::LeftAnti); + let is_anti = matches!(join_type, JoinType::LeftAnti | JoinType::RightAnti); + + let ((outer_stats, inner_stats), (outer_col_stats, inner_col_stats)) = + if is_left { + ( + (&left_stats, &right_stats), + (&left_col_stats, &right_col_stats), + ) + } else { + ( + (&right_stats, &left_stats), + (&right_col_stats, &left_col_stats), + ) + }; - Some(PartialJoinStatistics { - num_rows: cardinality, - column_statistics: outer_stats.column_statistics, - }) - } + let outer_rows = *outer_stats.num_rows.get_value()?; - // For AntiJoins estimation 
always equals to outer statistics, as - // non-overlapping inputs won't affect estimation - JoinType::LeftAnti | JoinType::RightAnti => { - let outer_stats = match join_type { - JoinType::LeftAnti => left_stats, - _ => right_stats, - }; + let cardinality = + if estimate_disjoint_inputs(outer_stats, inner_stats).is_some() { + // Disjoint inputs: semi produces 0, anti keeps all rows. + if is_anti { outer_rows } else { 0 } + } else { + match estimate_semi_join_cardinality( + &outer_stats.num_rows, + &inner_stats.num_rows, + outer_col_stats, + inner_col_stats, + ) { + Some(semi) if is_anti => outer_rows.saturating_sub(semi), + Some(semi) => semi, + None => outer_rows, + } + }; + let outer_stats = if is_left { left_stats } else { right_stats }; Some(PartialJoinStatistics { - num_rows: *outer_stats.num_rows.get_value()?, + num_rows: cardinality, column_statistics: outer_stats.column_statistics, }) } @@ -699,6 +712,95 @@ fn estimate_disjoint_inputs( None } +/// Estimates the number of outer rows that have at least one matching +/// key on the inner side (i.e. semi join cardinality) using NDV +/// (Number of Distinct Values) statistics. +/// +/// Assuming the smaller domain is contained in the larger, the number +/// of overlapping distinct values is `min(outer_ndv, inner_ndv)`. 
+/// Under the uniformity assumption (each distinct value contributes
+/// equally to row counts), the surviving fraction of outer rows is
+/// `min(outer_ndv, inner_ndv) / outer_ndv`.
+/// Null rows cannot match, so each column's selectivity is further
+/// reduced by the outer null fraction:
+///
+/// ```text
+/// null_frac_i = outer_null_count_i / outer_rows
+/// selectivity_i = min(outer_ndv_i, inner_ndv_i) / outer_ndv_i * (1 - null_frac_i)
+/// ```
+///
+/// For multi-column join keys the overall selectivity is the product
+/// of per-column factors:
+///
+/// ```text
+/// semi_cardinality = outer_rows * product_i(selectivity_i)
+/// ```
+///
+/// Anti join cardinality is derived as the complement:
+/// `outer_rows - semi_cardinality`.
+///
+/// Boundary cases:
+/// * `inner_ndv >= outer_ndv` → selectivity = `1.0 - null_frac`
+/// * `null_frac = 1.0` → selectivity = 0.0 (no non-null rows can match)
+/// * Missing NDV statistics → returns `None` (fallback to `outer_rows`)
+///
+/// PostgreSQL uses a similar approach in `eqjoinsel_semi`
+/// (`src/backend/utils/adt/selfuncs.c`). When NDV statistics are
+/// available on both sides it computes selectivity as `nd2 / nd1`,
+/// which is equivalent to `min(outer_ndv, inner_ndv) / outer_ndv`.
+/// If either side lacks statistics it falls back to a default. 
+fn estimate_semi_join_cardinality( + outer_num_rows: &Precision, + inner_num_rows: &Precision, + outer_col_stats: &[ColumnStatistics], + inner_col_stats: &[ColumnStatistics], +) -> Option { + let outer_rows = *outer_num_rows.get_value()?; + if outer_rows == 0 { + return Some(0); + } + let inner_rows = *inner_num_rows.get_value()?; + if inner_rows == 0 { + return Some(0); + } + + let mut selectivity = 1.0_f64; + let mut has_selectivity_estimate = false; + + for (outer_stat, inner_stat) in outer_col_stats.iter().zip(inner_col_stats.iter()) { + let outer_has_stats = outer_stat.distinct_count.get_value().is_some() + || (outer_stat.min_value.get_value().is_some() + && outer_stat.max_value.get_value().is_some()); + let inner_has_stats = inner_stat.distinct_count.get_value().is_some() + || (inner_stat.min_value.get_value().is_some() + && inner_stat.max_value.get_value().is_some()); + if !outer_has_stats || !inner_has_stats { + continue; + } + + let outer_ndv = max_distinct_count(outer_num_rows, outer_stat); + let inner_ndv = max_distinct_count(inner_num_rows, inner_stat); + + if let (Some(&o), Some(&i)) = (outer_ndv.get_value(), inner_ndv.get_value()) + && o > 0 + { + let null_frac = outer_stat + .null_count + .get_value() + .map(|&nc| nc as f64 / outer_rows as f64) + .unwrap_or(0.0); + selectivity *= (o.min(i) as f64) / (o as f64) * (1.0 - null_frac); + has_selectivity_estimate = true; + } + } + + if has_selectivity_estimate { + Some((outer_rows as f64 * selectivity).ceil() as usize) + } else { + None + } +} + /// Estimate the number of maximum distinct values that can be present in the /// given column from its statistics. If distinct_count is available, uses it /// directly. 
Otherwise, if the column is numeric and has min/max values, it @@ -2697,7 +2799,7 @@ mod tests { JoinType::LeftSemi, (50, Inexact(10), Inexact(20), Absent, Absent), (10, Inexact(15), Inexact(25), Absent, Absent), - Some(50), + Some(46), ), ( JoinType::RightSemi, @@ -2733,13 +2835,13 @@ mod tests { JoinType::LeftAnti, (50, Inexact(10), Inexact(20), Absent, Absent), (10, Inexact(15), Inexact(25), Absent, Absent), - Some(50), + Some(4), ), ( JoinType::RightAnti, (50, Inexact(10), Inexact(20), Absent, Absent), (10, Inexact(15), Inexact(25), Absent, Absent), - Some(10), + Some(0), ), ( JoinType::LeftAnti, @@ -2765,6 +2867,108 @@ mod tests { (10, Inexact(30), Absent, Absent, Absent), Some(50), ), + // NDV-based semi join: outer_ndv=20, inner_ndv=10 + // selectivity = 10/20 = 0.5, cardinality = ceil(50 * 0.5) = 25 + ( + JoinType::LeftSemi, + (50, Inexact(1), Inexact(100), Inexact(20), Absent), + (10, Inexact(1), Inexact(100), Inexact(10), Absent), + Some(25), + ), + // inner_ndv(30) >= outer_ndv(20) -> selectivity 1.0, no reduction + ( + JoinType::LeftSemi, + (50, Inexact(1), Inexact(100), Inexact(20), Absent), + (100, Inexact(1), Inexact(100), Inexact(30), Absent), + Some(50), + ), + // NDV-based anti join: semi=25, anti = 50 - 25 = 25 + ( + JoinType::LeftAnti, + (50, Inexact(1), Inexact(100), Inexact(20), Absent), + (10, Inexact(1), Inexact(100), Inexact(10), Absent), + Some(25), + ), + // inner covers all outer: semi=50, anti = 0 + ( + JoinType::LeftAnti, + (50, Inexact(1), Inexact(100), Inexact(20), Absent), + (100, Inexact(1), Inexact(100), Inexact(30), Absent), + Some(0), + ), + // RightSemi with explicit NDV (NDV within row count, used as-is): + // For RightSemi, sides are swapped: outer = right (20 rows, ndv=10), + // inner = left (50 rows, ndv=5). selectivity = min(10,5)/10 = 0.5, + // cardinality = ceil(20 * 0.5) = 10. 
+ ( + JoinType::RightSemi, + (50, Inexact(1), Inexact(100), Inexact(5), Absent), + (20, Inexact(1), Inexact(100), Inexact(10), Absent), + Some(10), + ), + // RightAnti with explicit NDV: anti = outer_rows - semi = 20 - 10 = 10. + ( + JoinType::RightAnti, + (50, Inexact(1), Inexact(100), Inexact(5), Absent), + (20, Inexact(1), Inexact(100), Inexact(10), Absent), + Some(10), + ), + // RightSemi where right-side NDV (20) exceeds right-side row count (10): + // NDV is clamped to 10, so outer_ndv=10, inner_ndv=10, + // selectivity = min(10,10)/10 = 1.0, cardinality = ceil(10 * 1.0) = 10. + ( + JoinType::RightSemi, + (50, Inexact(1), Inexact(100), Inexact(10), Absent), + (10, Inexact(1), Inexact(100), Inexact(20), Absent), + Some(10), + ), + // RightAnti with NDV clamped by row count: anti = 10 - 10 = 0. + ( + JoinType::RightAnti, + (50, Inexact(1), Inexact(100), Inexact(10), Absent), + (10, Inexact(1), Inexact(100), Inexact(20), Absent), + Some(0), + ), + // Empty inner table: no match possible, semi → 0 + ( + JoinType::LeftSemi, + (100, Absent, Absent, Absent, Absent), + (0, Absent, Absent, Absent, Absent), + Some(0), + ), + // NDV-based semi with nulls on outer side: + // outer_ndv=20, inner_ndv=10, null_frac=10/100=0.1 + // selectivity = 10/20 * (1-0.1) = 0.5 * 0.9 = 0.45 + // semi = ceil(100 * 0.45) = 45 + ( + JoinType::LeftSemi, + (100, Absent, Absent, Inexact(20), Inexact(10)), + (200, Absent, Absent, Inexact(10), Absent), + Some(45), + ), + // Anti-join with nulls on outer side: + // semi=45, anti = 100 - 45 = 55 + ( + JoinType::LeftAnti, + (100, Absent, Absent, Inexact(20), Inexact(10)), + (200, Absent, Absent, Inexact(10), Absent), + Some(55), + ), + // All outer rows are null: null_frac=1.0 + // selectivity = 10/20 * (1-1.0) = 0.0, semi = 0 + ( + JoinType::LeftSemi, + (100, Absent, Absent, Inexact(20), Inexact(100)), + (200, Absent, Absent, Inexact(10), Absent), + Some(0), + ), + // All outer rows are null (anti): anti = 100 - 0 = 100 + ( + JoinType::LeftAnti, 
+ (100, Absent, Absent, Inexact(20), Inexact(100)), + (200, Absent, Absent, Inexact(10), Absent), + Some(100), + ), ]; let join_on = vec![( @@ -2884,6 +3088,157 @@ mod tests { Ok(()) } + #[test] + fn test_semi_join_multi_column_and_mixed_stats() -> Result<()> { + let join_on = vec![ + ( + Arc::new(Column::new("l_col0", 0)) as _, + Arc::new(Column::new("r_col0", 0)) as _, + ), + ( + Arc::new(Column::new("l_col1", 1)) as _, + Arc::new(Column::new("r_col1", 1)) as _, + ), + ]; + + // Multi-column: both columns have NDV on both sides. + // col0: outer_ndv=20, inner_ndv=10 → selectivity = 10/20 = 0.5 + // col1: outer_ndv=40, inner_ndv=10 → selectivity = 10/40 = 0.25 + // total selectivity = 0.5 * 0.25 = 0.125 + // semi = ceil(100 * 0.125) = 13 + let result = estimate_join_cardinality( + &JoinType::LeftSemi, + Statistics { + num_rows: Inexact(100), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(20), Absent), + create_column_stats(Absent, Absent, Inexact(40), Absent), + ], + }, + Statistics { + num_rows: Inexact(200), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(10), Absent), + create_column_stats(Absent, Absent, Inexact(10), Absent), + ], + }, + &join_on, + ) + .map(|c| c.num_rows); + assert_eq!(result, Some(13), "multi-column semi join"); + + // Multi-column anti: anti = 100 - 13 = 87 + let result = estimate_join_cardinality( + &JoinType::LeftAnti, + Statistics { + num_rows: Inexact(100), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(20), Absent), + create_column_stats(Absent, Absent, Inexact(40), Absent), + ], + }, + Statistics { + num_rows: Inexact(200), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(10), Absent), + create_column_stats(Absent, Absent, Inexact(10), Absent), + ], + }, + &join_on, + ) + .map(|c| c.num_rows); + assert_eq!(result, Some(87), 
"multi-column anti join"); + + // Mixed stats: col0 has NDV on both sides, col1 has NDV only on outer. + // col1 is skipped (either side missing), so selectivity comes from col0 only. + // col0: outer_ndv=20, inner_ndv=10 → selectivity = 0.5 + // semi = ceil(100 * 0.5) = 50 + let result = estimate_join_cardinality( + &JoinType::LeftSemi, + Statistics { + num_rows: Inexact(100), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(20), Absent), + create_column_stats(Absent, Absent, Inexact(40), Absent), + ], + }, + Statistics { + num_rows: Inexact(200), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(10), Absent), + create_column_stats(Absent, Absent, Absent, Absent), + ], + }, + &join_on, + ) + .map(|c| c.num_rows); + assert_eq!(result, Some(50), "mixed stats: col1 skipped"); + + // Mixed stats: neither column has stats on both sides → fallback to outer_rows + let result = estimate_join_cardinality( + &JoinType::LeftSemi, + Statistics { + num_rows: Inexact(100), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(20), Absent), + create_column_stats(Absent, Absent, Absent, Absent), + ], + }, + Statistics { + num_rows: Inexact(200), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Absent, Absent), + create_column_stats(Absent, Absent, Inexact(10), Absent), + ], + }, + &join_on, + ) + .map(|c| c.num_rows); + assert_eq!(result, Some(100), "no column has stats on both sides"); + + // Multi-column with nulls on one column: + // col0: outer_ndv=20, inner_ndv=10, null_frac=0.0 → 10/20 * 1.0 = 0.5 + // col1: outer_ndv=40, inner_ndv=10, null_frac=20/100=0.2 → 10/40 * 0.8 = 0.2 + // total selectivity = 0.5 * 0.2 = 0.1 + // semi = ceil(100 * 0.1) = 10 + let result = estimate_join_cardinality( + &JoinType::LeftSemi, + Statistics { + num_rows: Inexact(100), + total_byte_size: Absent, + 
column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(20), Absent), + create_column_stats(Absent, Absent, Inexact(40), Inexact(20)), + ], + }, + Statistics { + num_rows: Inexact(200), + total_byte_size: Absent, + column_statistics: vec![ + create_column_stats(Absent, Absent, Inexact(10), Absent), + create_column_stats(Absent, Absent, Inexact(10), Absent), + ], + }, + &join_on, + ) + .map(|c| c.num_rows); + assert_eq!( + result, + Some(10), + "multi-column semi join with nulls on one column" + ); + + Ok(()) + } + #[test] fn test_calculate_join_output_ordering() -> Result<()> { let left_ordering = LexOrdering::new(vec![ diff --git a/datafusion/sqllogictest/test_files/array/array_concat.slt b/datafusion/sqllogictest/test_files/array/array_concat.slt index 0f847811615c7..168b307a1e636 100644 --- a/datafusion/sqllogictest/test_files/array/array_concat.slt +++ b/datafusion/sqllogictest/test_files/array/array_concat.slt @@ -121,6 +121,38 @@ select ---- [1, 2, 3] List(Utf8View) +# Concatenating mixed list and large list — return type widens to LargeList +query ?T +select + array_concat(make_array(1, 2), arrow_cast([3, 4], 'LargeList(Int64)')), + arrow_typeof(array_concat(make_array(1, 2), arrow_cast([3, 4], 'LargeList(Int64)'))); +---- +[1, 2, 3, 4] LargeList(Int64) + +# Reverse argument order: LargeList first, plain list second +query ?T +select + array_concat(arrow_cast([1, 2], 'LargeList(Int64)'), make_array(3, 4)), + arrow_typeof(array_concat(arrow_cast([1, 2], 'LargeList(Int64)'), make_array(3, 4))); +---- +[1, 2, 3, 4] LargeList(Int64) + +# FixedSizeList combined with LargeList — also widens to LargeList +query ?T +select + array_concat(arrow_cast([1, 2], 'FixedSizeList(2, Int64)'), arrow_cast([3, 4], 'LargeList(Int64)')), + arrow_typeof(array_concat(arrow_cast([1, 2], 'FixedSizeList(2, Int64)'), arrow_cast([3, 4], 'LargeList(Int64)'))); +---- +[1, 2, 3, 4] LargeList(Int64) + +# Three-way mix: List, LargeList, List +query ?T +select + 
array_concat(make_array(1, 2), arrow_cast([3], 'LargeList(Int64)'), make_array(4, 5)), + arrow_typeof(array_concat(make_array(1, 2), arrow_cast([3], 'LargeList(Int64)'), make_array(4, 5))); +---- +[1, 2, 3, 4, 5] LargeList(Int64) + # array_concat with NULL elements inside arrays query ? select array_concat([1, NULL, 3], [NULL, 5]); diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index c277f69d0bee2..030ca729f265a 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -347,3 +347,28 @@ SELECT CAST(approx_percentile_cont(quantity, 0.5) AS BIGINT) FROM orders; ``` [#21074]: https://github.com/apache/datafusion/pull/21074 + +### `Box` and `Arc` `TreeNodeContainer` impls now require `C: Default` + +The generic `TreeNodeContainer` implementations for `Box` and `Arc` now +require `C: Default`. This change was necessary as part of optimizing tree +rewriting to reduce heap allocations. + +**Who is affected:** + +- Users that implement `TreeNodeContainer` on a custom type and wrap it in + `Box` or `Arc` when walking trees. + +**Migration guide:** + +Add a `Default` implementation to your type. 
The default value is used as a +temporary placeholder during query optimization, so when possible, pick a cheap, +allocation-free variant: + +```rust,ignore +impl Default for MyTreeNode { + fn default() -> Self { + MyTreeNode::Leaf // or whichever variant is cheapest to construct + } +} +``` diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 49c9eea29ef73..46039f3c99c27 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -101,7 +101,7 @@ The following configuration settings are available: | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | -| datafusion.execution.parquet.created_by | datafusion version 53.0.0 | (writing) Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 53.1.0 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | | datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page |