From dcdae4fc4a014c042a94e3e5f09e0f688c7253b6 Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Thu, 25 Jun 2026 16:10:19 -0400 Subject: [PATCH 01/10] feat: spatialbench wkb Signed-off-by: Nemo Yu --- Cargo.lock | 912 ++++++++++++------ Cargo.toml | 8 + benchmarks/duckdb-bench/src/lib.rs | 21 + benchmarks/duckdb-bench/src/main.rs | 3 +- vortex-bench/Cargo.toml | 3 + vortex-bench/spatialbench.sql | 172 ++++ vortex-bench/src/benchmark.rs | 8 + vortex-bench/src/datasets/mod.rs | 7 + vortex-bench/src/lib.rs | 11 + vortex-bench/src/spatialbench/benchmark.rs | 148 +++ vortex-bench/src/spatialbench/datagen/mod.rs | 11 + .../src/spatialbench/datagen/table.rs | 30 + vortex-bench/src/spatialbench/datagen/wkb.rs | 107 ++ vortex-bench/src/spatialbench/mod.rs | 11 + vortex-bench/src/v3.rs | 6 + 15 files changed, 1162 insertions(+), 296 deletions(-) create mode 100644 vortex-bench/spatialbench.sql create mode 100644 vortex-bench/src/spatialbench/benchmark.rs create mode 100644 vortex-bench/src/spatialbench/datagen/mod.rs create mode 100644 vortex-bench/src/spatialbench/datagen/table.rs create mode 100644 vortex-bench/src/spatialbench/datagen/wkb.rs create mode 100644 vortex-bench/src/spatialbench/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 66457601e06..1ad8b82a5b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -196,25 +196,57 @@ version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe" +[[package]] +name = "arrow" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb98341a7e051bb79731ecb33ec00cbd6e0e315a542d6732b46d462c9215ea2" +dependencies = [ + "arrow-arith 56.2.1", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ord 56.2.1", + "arrow-row 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "arrow-string 56.2.1", +] + [[package]] name = "arrow" version = "58.3.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", "arrow-csv", - "arrow-data", - "arrow-ipc", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", +] + +[[package]] +name = "arrow-arith" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce4751cbc4bcccfeeea79df9571ff1dc066d61e44723c7604d11c7937f5b560" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "num", ] [[package]] @@ -223,14 +255,30 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "num-traits", ] +[[package]] +name = "arrow-array" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b02ccba2e977a3aabb4384036109ca32f552399a2bc0588f925f91ed073ce70c" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "half", + "hashbrown 0.16.1", + "num", +] + [[package]] name = "arrow-array" version = "58.3.0" @@ -238,9 +286,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash 0.8.12", - 
"arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "chrono-tz", "half", @@ -256,9 +304,9 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "049230728cd6e093088c8d231b4beede184e35cad7777c1505c0d5a8571f4376" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "bytes", "bzip2", "crc", @@ -274,6 +322,17 @@ dependencies = [ "zstd", ] +[[package]] +name = "arrow-buffer" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90f8bece6a9ee316a699fbbfde368a206676a1206ce89b50f07937648e76c3c" +dependencies = [ + "bytes", + "half", + "num", +] + [[package]] name = "arrow-buffer" version = "58.3.0" @@ -286,18 +345,39 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-cast" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ffe645cfb4e80b1ca37a3a106ce7b4af66ccdd60c655a57e6b9aab096164a7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + [[package]] name = "arrow-cast" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "atoi", "base64", "chrono", @@ -314,41 +394,67 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "csv", "csv-core", "regex", ] +[[package]] +name = "arrow-data" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78468c813909465dd0f858950c8a0614eb63608134acf95c602ec21381258b28" +dependencies = [ + "arrow-buffer 56.2.1", + "arrow-schema 56.2.1", + "half", + "num", +] + [[package]] name = "arrow-data" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "half", "num-integer", "num-traits", ] +[[package]] +name = "arrow-ipc" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f88b0fbb33af28089ccd3e4dcd0ff09de46842168d00220b920f7231feddf5" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "flatbuffers", +] + [[package]] name = "arrow-ipc" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "flatbuffers", - "lz4_flex", + "lz4_flex 0.13.1", "zstd", ] @@ -358,12 +464,12 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - 
"arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -377,17 +483,43 @@ dependencies = [ "simdutf8", ] +[[package]] +name = "arrow-ord" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed58a38c3db0a2cf75ef70e3cb6bc4bd0da0a3d390de37c36139b31fae826e8" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", +] + [[package]] name = "arrow-ord" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", +] + +[[package]] +name = "arrow-row" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "079ced0517daf4f09b070d09ff641cee7cc331aa216bebcb25d1a6474ad53086" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "half", ] [[package]] @@ -396,13 +528,19 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "half", ] +[[package]] +name = "arrow-schema" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a0d5eb3fe25337ff83e8333a08379bdd1540b0961b1c888f6e505d971c198e1" + [[package]] name = "arrow-schema" version = "58.3.0" @@ -414,6 
+552,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow-select" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2368a78bd32902dba39d52519d70f63799c8b5dc8a9477129a30c2fd3dc70c19" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "num", +] + [[package]] name = "arrow-select" version = "58.3.0" @@ -421,24 +573,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "num-traits", ] +[[package]] +name = "arrow-string" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dece58a130b9187756ded8bc071bd8ee9dd7a146566af244b297c7e632fd1ef7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name = "arrow-string" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "memchr", "num-traits", "regex", @@ -1391,11 +1560,12 @@ dependencies = [ [[package]] name = "comfy-table" -version = "7.2.2" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +checksum = 
"e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "unicode-segmentation", + "strum 0.26.3", + "strum_macros 0.26.4", "unicode-width 0.2.2", ] @@ -1418,8 +1588,8 @@ name = "compress-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "clap", @@ -1427,7 +1597,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "lance-bench", - "parquet", + "parquet 58.3.0", "regex", "tokio", "tracing", @@ -1943,8 +2113,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "chrono", @@ -1992,8 +2162,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "997a31e15872606a49478e670c58302094c97cb96abb0a7d60720f8e92170040" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bzip2", "chrono", @@ -2031,7 +2201,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", + "parquet 58.3.0", "sqlparser 0.62.0", "tempfile", "tokio", @@ -2072,7 +2242,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 53.1.0", @@ -2097,7 +2267,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7dd61161508f8f5fa1107774ea687bd753c22d83a32eebf963549f89de14139" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2122,7 +2292,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2145,7 +2315,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897c70f871277f9ce99aa38347be0d679bbe3e617156c4d2a8378cec8a2a0891" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 54.0.0", @@ -2169,8 +2339,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "chrono", "half", "hashbrown 0.16.1", @@ -2191,9 +2361,9 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c9ded5d87d9172319e006f2afdb9928d72dbacd6a90a458d8acb1e3b43a65" dependencies = [ - "arrow", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "chrono", "foldhash 0.2.0", "half", @@ -2203,7 +2373,7 @@ dependencies = [ "libc", "log", "object_store", - "parquet", + "parquet 58.3.0", "recursive", "sqlparser 0.62.0", "tokio", @@ -2239,7 +2409,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "chrono", @@ -2268,7 +2438,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd7d295b2ec7c00d8a56562f41ed41062cf0af75549ed891c12a0a09eddfefe" dependencies = [ - "arrow", + "arrow 58.3.0", "async-compression", "async-trait", "bytes", @@ -2304,8 +2474,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2328,8 +2498,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552b0b3f342f7ec41b3fbd70f6339dc82a30cfd0349e7f280e7852528085349f" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2352,7 +2522,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb517d08967d536284ce70afb5fe8583133779249f2d7b90587d339741a7f195" dependencies = [ - "arrow", + "arrow 58.3.0", "arrow-avro", "async-trait", "bytes", @@ -2371,7 +2541,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2394,7 +2564,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68850aa426b897e879c8b87e512ea8124f1d0a2869a4e51808ddaaddf1bc0ada" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2417,7 +2587,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2441,7 +2611,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "402f93242ae08ef99139ee2c528a49d087efe88d5c7b2c3ff5480855a40ce54f" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2464,7 +2634,7 @@ version = "54.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd2499c1bee0eeccf6a57156105700eeeb17bc701899ac719183c4e74231450" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2485,7 +2655,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", + "parquet 58.3.0", "tokio", ] @@ -2507,8 +2677,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "chrono", "dashmap", @@ -2530,8 +2700,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37a8643ab852eb68864e1b72ae789e8066282dce48eea6347ffb0aee33d1ccc0" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2552,7 +2722,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "chrono", "datafusion-common 53.1.0", @@ -2574,8 +2744,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6932f4d71eed9c8d9341476a2b845aadfabde5495d08dbcd8fc23881f49fa7a0" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "chrono", "datafusion-common 54.0.0", @@ -2597,7 +2767,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2610,7 +2780,7 @@ version = "54.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0225491839a31b1f7d2cb8092c2d50792e2fe1c1724e4e6d08e011f5feaf4ed2" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2622,8 +2792,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2654,8 +2824,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14872c47bfc3d21e53ec82f57074e6987a15941c1e2f43cde4ac6ae2746634e3" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2687,7 +2857,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2708,7 +2878,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75a2ca14e1b609be21e657e2d3130b2f446456b08393b377bb721a33952d2e09" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-execution 54.0.0", @@ -2730,7 +2900,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", "datafusion-physical-expr-common 53.1.0", @@ -2742,7 +2912,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ece74ba09092d2ef9c9b54a38445450aea292a1f8b04faf531936b723a24b3c" 
dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", "datafusion-physical-expr-common 54.0.0", @@ -2754,8 +2924,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2779,8 +2949,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f3e3f9ee8ca59bf70518802107de6f1b88a9509efdc629fadc5de9d6b2d5ef5" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-execution 54.0.0", @@ -2804,7 +2974,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2820,7 +2990,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89161dffc22cf2b50f9f4b1bee83b5221d3b4ed7c2e37fd7aa2b22a5297b3a26" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 54.0.0", @@ -2836,7 +3006,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-expr 53.1.0", @@ -2854,7 +3024,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7339345b226b3874037708bf5023ba1c2de705128f8457a095aae5ae9cb9c78" dependencies = [ - "arrow", + "arrow 58.3.0", 
"datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-expr 54.0.0", @@ -2913,7 +3083,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", @@ -2932,7 +3102,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77f20e8cf9e8654d92f4c16b24c487353ee5bf153ffc12d5772cd399ab8cd281" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", @@ -2953,7 +3123,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-expr-common 53.1.0", @@ -2975,7 +3145,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f015a4a82f6f7ff7e1d8d4bf3870a936752fa38b17705dfcc14adef95aa8922c" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-expr-common 54.0.0", @@ -2997,7 +3167,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-functions 53.1.0", @@ -3012,7 +3182,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51e6ffff8acdfe54e0ea15ccf38115c4a9184433b0439f42907637928d00a235" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-functions 54.0.0", @@ -3028,7 +3198,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", @@ -3044,7 +3214,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7967a3e171c6a4bf09474b3f7a14f1a3db13ed1714ba12156f33fcce2bba54e8" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", @@ -3061,7 +3231,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-execution 53.1.0", "datafusion-expr 53.1.0", @@ -3079,7 +3249,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59ff803e2a96054cb6d83f35f9e60fd4f42eac515e1932bd1b2dbc91d5fcbf36" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-execution 54.0.0", "datafusion-expr 54.0.0", @@ -3099,9 +3269,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion-common 53.1.0", "datafusion-common-runtime 53.1.0", @@ -3130,11 +3300,11 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "776ee54d47d15bdb126452f9ca17b03761e3b004682914beaedd3f86eb507fbc" dependencies = [ - "arrow", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", 
"datafusion-common 54.0.0", "datafusion-common-runtime 54.0.0", @@ -3163,7 +3333,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-datasource 53.1.0", "datafusion-expr-common 53.1.0", @@ -3180,7 +3350,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fb9e5774660aa69c3ba93c610f175f75b65cb8c3776edb3626de8f3a4f4ee3" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-datasource 54.0.0", "datafusion-expr-common 54.0.0", @@ -3224,7 +3394,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "390bb0bf37cb2b95ffd65eacd66f60df50793d1f94097799e416f39477a51957" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "crc32fast", @@ -3254,7 +3424,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 53.1.0", @@ -3272,7 +3442,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6094ad36a3ed6d7ac87b20b479b2d0b118250f66cf997603829fdc65b44a7099" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 54.0.0", @@ -3291,7 +3461,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0c08025966108056d3547d879c4d39e246277494f59ca12920f78187d95eea1" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bigdecimal", "clap", @@ -3833,7 +4003,7 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcd0ce0249ac12fd44fcde62d435c36d881952c2f0df4d1de24b45e1dbba5ddb" 
dependencies = [ - "arrow-array", + "arrow-array 58.3.0", "rand 0.9.4", ] @@ -4058,14 +4228,14 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dafe7b7de3fab1a8b7099fd6a6434ca955fa65065f9c19f0f8a133693f3c2b0e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "geo-traits", "geoarrow-schema", "num-traits", "wkb", - "wkt", + "wkt 0.14.0", ] [[package]] @@ -4074,7 +4244,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d4a7edb2a1d87024a93805332a9c8184a0354836271d42c0d18cf628a5e3cd0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "geo-traits", "serde", "serde_json", @@ -4090,6 +4260,33 @@ dependencies = [ "libm", ] +[[package]] +name = "geojson" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e26f3c45b36fccc9cf2805e61d4da6bc4bbd5a3a9589b01afa3a40eff703bd79" +dependencies = [ + "log", + "serde", + "serde_json", + "thiserror 2.0.18", +] + +[[package]] +name = "geozero" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f28f34864745eb2f123c990c6ffd92c1584bd39439b3f27ff2a0f4ea5b309b" +dependencies = [ + "geo-types", + "geojson", + "log", + "scroll", + "serde_json", + "thiserror 1.0.69", + "wkt 0.11.1", +] + [[package]] name = "get_dir" version = "0.5.0" @@ -5014,16 +5211,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3944aca86f4c78f4da04af1c2bf33e664a2826b7af72972ad200d6b9de59019f" dependencies = [ "arc-swap", - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ipc", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-row 
58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "async_cell", @@ -5085,13 +5282,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "253f4a0a70580c985b91e65e9ca6cad644825a4078de28d8efbacf3ffbd7ecdc" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytes", "futures", "getrandom 0.2.17", @@ -5106,13 +5303,13 @@ name = "lance-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-cast", + "arrow-cast 58.3.0", "async-trait", "clap", "futures", "lance", "lance-encoding", - "parquet", + "parquet 58.3.0", "tempfile", "tokio", "tracing", @@ -5136,9 +5333,9 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13f84020da5a484e2f07dd1796e09785ed7cd889857ebc4cb77e32ef214ee594" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5173,13 +5370,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7460597a66534a75987993d4dac5bc330586d99c5b79ae73367dbcbd4e29e576" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "chrono", "datafusion 53.1.0", @@ -5205,10 +5402,10 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046f5506ed2271cd941a050de7bf535dd3aedc291aadec836a63fa56c5926e3b" dependencies = [ - "arrow", - 
"arrow-array", - "arrow-cast", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "futures", "half", @@ -5225,13 +5422,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7af54edf43dcf9d6a56cc636eb35d457e68373c6448dca3f0891b3325b4a24e6" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytemuck", "byteorder", "bytes", @@ -5262,12 +5459,12 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0772ae2d6207995dc1eb28aff9507f78e90b3362b58f311da001e9dc25f3d736" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -5296,12 +5493,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e71fbfb51096a903cb524fe0da716f5f15fbc4a6b6f84cd6dec21abf319c5e84" dependencies = [ "arc-swap", - "arrow", - "arrow-arith", - "arrow-array", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-channel", "async-recursion", "async-trait", @@ -5360,14 +5557,14 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab8c98ef1b870b20541d27f3ca4efdf7c9f5c25214233be07d231ba88900219" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - 
"arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -5400,9 +5597,9 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b4c51cad0ac780b02dc4da48528479e7693c03e8d05390510bbc69ca2a9a1f1" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "cc", "deepsize", "half", @@ -5418,7 +5615,7 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "014e8332ca0615506342e0d3af608639864b68396973be14239f09c9f21f1fc2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "lance-core", @@ -5446,11 +5643,11 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16f1355904aea4ebb04ffc70c58c97901e10bde44452b4b021de4a1f329250d" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5793,6 +5990,15 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + [[package]] name = "lz4_flex" version = "0.13.1" @@ -6135,6 +6341,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + 
"num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -6169,6 +6389,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -6523,6 +6765,41 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3abbfef8a25900f4925c86e4cb881ea24672ca3c31ee4fb50a8083c4c56d313" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ipc 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.11.6", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + [[package]] name = "parquet" version = "58.3.0" @@ -6530,12 +6807,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "base64", "brotli", "bytes", @@ -6544,7 +6821,7 @@ dependencies = [ "futures", 
"half", "hashbrown 0.17.1", - "lz4_flex", + "lz4_flex 0.13.1", "num-bigint", "num-integer", "num-traits", @@ -6565,8 +6842,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74c8db065291f088a2aad8ab831853eae1871c0d311c8d0b83bbc3b7e735d0fc" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6581,8 +6858,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a530e8d5b5e14efcb39c9a6ec55432ad11f6afb7dc4455a79be0dc615fe3cc31" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6598,7 +6875,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00ed89908289f67caa2ca078f9ff9aacd6229a313ec92b12bf4f48f613dc2b97" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "base64", "chrono", "parquet-variant", @@ -7989,6 +8266,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" +[[package]] +name = "scroll" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04c565b551bafbef4157586fa379538366e4385d42082f255bfd96e4fe8519da" + [[package]] name = "seahash" version = "4.1.0" @@ -8410,6 +8693,30 @@ dependencies = [ "smallvec", ] +[[package]] +name = "spatialbench" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07f3f4b67ccf571f183d3695aa6b9d6f996864c31782a480e97a23ed0f2f6f18" +dependencies = [ + "geo", + "once_cell", + "rand 0.8.6", + "serde", +] + +[[package]] +name = "spatialbench-arrow" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad89c32ed9e258bcc89713c296c7437963ce31f511eb8a408d2046e853294206" +dependencies = [ + "arrow 
56.2.1", + "geo", + "geozero", + "spatialbench", +] + [[package]] name = "sqllogictest" version = "0.29.1" @@ -9145,7 +9452,7 @@ name = "tpchgen-arrow" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" dependencies = [ - "arrow", + "arrow 58.3.0", "tpchgen", ] @@ -9426,12 +9733,12 @@ name = "vortex" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", + "arrow-array 58.3.0", "codspeed-divan-compat", "fastlanes", "futures", "mimalloc", - "parquet", + "parquet 58.3.0", "paste", "rand 0.10.1", "rand_distr 0.6.0", @@ -9497,15 +9804,15 @@ dependencies = [ "arbitrary", "arc-swap", "arcref", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", "async-lock", "bytes", "cfg-if", @@ -9570,9 +9877,9 @@ name = "vortex-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "bzip2", "clap", @@ -9587,12 +9894,15 @@ dependencies = [ "noodles-bgzf", "noodles-vcf", "parking_lot", - "parquet", + "parquet 56.2.1", + "parquet 58.3.0", "rand 0.10.1", "regex", "reqwest 0.13.4", "serde", "serde_json", + "spatialbench", + "spatialbench-arrow", "sysinfo", "tabled", "target-lexicon", @@ -9645,7 +9955,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ - "arrow-buffer", + "arrow-buffer 58.3.0", "bitvec", "bytes", "codspeed-divan-compat", @@ -9677,13 +9987,13 @@ dependencies = [ name = "vortex-compat" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-select", + "arrow-array 58.3.0", + 
"arrow-select 58.3.0", "base16ct", "bytes", "clap", "futures", - "parquet", + "parquet 58.3.0", "reqwest 0.13.4", "serde", "serde_json", @@ -9725,11 +10035,11 @@ dependencies = [ name = "vortex-compute" version = "0.1.0" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-schema", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "num-traits", "rand 0.10.1", @@ -9752,7 +10062,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-schema", + "arrow-schema 58.3.0", "async-trait", "bindgen", "bytes", @@ -9783,7 +10093,7 @@ dependencies = [ name = "vortex-cuda-ffi" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-array", @@ -9805,8 +10115,8 @@ name = "vortex-cxx" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "cxx", "futures", @@ -9820,8 +10130,8 @@ name = "vortex-datafusion" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion 54.0.0", "datafusion-catalog 54.0.0", @@ -9920,7 +10230,7 @@ dependencies = [ name = "vortex-error" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "flatbuffers", "jiff", "object_store", @@ -9955,8 +10265,8 @@ dependencies = [ name = "vortex-ffi" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "cbindgen", @@ -10078,8 +10388,8 @@ dependencies = [ name = "vortex-geo" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "geo", "geo-traits", "geo-types", @@ -10145,8 +10455,8 @@ dependencies = [ name = "vortex-jni" version = "0.1.0" dependencies = [ - 
"arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "futures", "jni", @@ -10165,8 +10475,8 @@ dependencies = [ name = "vortex-json" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "prost 0.14.4", "vortex-array", "vortex-error", @@ -10179,8 +10489,8 @@ name = "vortex-layout" version = "0.1.0" dependencies = [ "arcref", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-stream", "async-trait", "bit-vec", @@ -10270,9 +10580,9 @@ dependencies = [ name = "vortex-parquet-variant" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "chrono", "parquet-variant", "parquet-variant-compute", @@ -10317,9 +10627,9 @@ dependencies = [ name = "vortex-python" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "itertools 0.14.0", @@ -10349,9 +10659,9 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-row", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", "bytes", "codspeed-divan-compat", "mimalloc", @@ -10370,8 +10680,8 @@ name = "vortex-runend" version = "0.1.0" dependencies = [ "arbitrary", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "itertools 0.14.0", "num-traits", @@ -10467,8 +10777,8 @@ dependencies = [ name = "vortex-tensor" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "half", "itertools 0.14.0", @@ -10491,8 +10801,8 @@ dependencies = [ name = "vortex-test-e2e-cuda" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + 
"arrow-array 58.3.0", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-cuda", @@ -10503,8 +10813,8 @@ name = "vortex-tui" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "clap", "console_error_panic_hook", "crossterm", @@ -10517,7 +10827,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "js-sys", - "parquet", + "parquet 58.3.0", "ratatui", "ratzilla", "serde", @@ -10544,9 +10854,9 @@ dependencies = [ name = "vortex-web-wasm" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "console_error_panic_hook", "futures", "js-sys", @@ -11090,6 +11400,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "wkt" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54f7f1ff4ea4c18936d6cd26a6fd24f0003af37e951a8e0e8b9e9a2d0bd0a46d" +dependencies = [ + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "wkt" version = "0.14.0" diff --git a/Cargo.toml b/Cargo.toml index deed8b8d58a..903a6a46b10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -242,6 +242,14 @@ similar = "3.0.0" sketches-ddsketch = "0.4.0" smallvec = "1.15.1" smol = "2.0.2" +spatialbench = "0.2" +spatialbench-arrow = "0.2" +# spatialbench still pins arrow 56, two majors behind the workspace arrow. Until upstream +# catches up, write its generated batches with a matching parquet instead of converting +# arrow versions at the boundary. 
+spatialbench-parquet = { package = "parquet", version = "56", features = [ + "async", +] } static_assertions = "1.1" strum = "0.28" syn = { version = "2.0.117", features = ["full"] } diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs index 69e4da63853..4ec1efd0993 100644 --- a/benchmarks/duckdb-bench/src/lib.rs +++ b/benchmarks/duckdb-bench/src/lib.rs @@ -26,6 +26,8 @@ pub struct DuckClient { connection: Option, pub db_path: PathBuf, pub threads: Option, + /// `INSTALL spatial; LOAD spatial;` for SpatialBench. + init_sql: Vec, } impl DuckClient { @@ -67,9 +69,19 @@ impl DuckClient { connection: Some(connection), db_path, threads, + init_sql: Vec::new(), }) } + /// Run `statements` now and after every subsequent [`DuckClient::reopen`]. + pub fn set_init_sql(&mut self, statements: Vec) -> Result<()> { + for stmt in &statements { + self.connection().query(stmt)?; + } + self.init_sql = statements; + Ok(()) + } + pub fn open_and_setup_database( path: Option, threads: Option, @@ -108,6 +120,14 @@ impl DuckClient { self.db = Some(db); self.connection = Some(connection); + // Replay init SQL (e.g. LOAD spatial). 
+ for stmt in &self.init_sql { + self.connection + .as_ref() + .vortex_expect("connection just opened") + .query(stmt)?; + } + Ok(()) } @@ -123,6 +143,7 @@ impl DuckClient { connection: Some(connection), db_path, threads: None, + init_sql: Vec::new(), }) } diff --git a/benchmarks/duckdb-bench/src/main.rs b/benchmarks/duckdb-bench/src/main.rs index 8ba4937f566..cf4fa071067 100644 --- a/benchmarks/duckdb-bench/src/main.rs +++ b/benchmarks/duckdb-bench/src/main.rs @@ -171,12 +171,13 @@ fn main() -> anyhow::Result<()> { &filtered_queries, mode, |format| { - let ctx = DuckClient::new( + let mut ctx = DuckClient::new( &*benchmark, format, args.delete_duckdb_database, args.threads, )?; + ctx.set_init_sql(benchmark.engine_init_sql(Engine::DuckDB))?; ctx.register_tables(&*benchmark, format)?; // Duckdb doesn't support octet_length for strings but we need this diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 3b793c6124a..96180f8bcff 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -48,6 +48,9 @@ regex = { workspace = true } reqwest = { workspace = true, features = ["stream"] } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } +spatialbench = { workspace = true } +spatialbench-arrow = { workspace = true } +spatialbench-parquet = { workspace = true } sysinfo = { workspace = true } tabled = { workspace = true, features = ["std"] } target-lexicon = { workspace = true } diff --git a/vortex-bench/spatialbench.sql b/vortex-bench/spatialbench.sql new file mode 100644 index 00000000000..d566dc80186 --- /dev/null +++ b/vortex-bench/spatialbench.sql @@ -0,0 +1,172 @@ +-- SpatialBench queries (DuckDB dialect), from sedona-spatialbench DuckDBSpatialBenchBenchmark +-- (spatialbench-queries/print_queries.py). Query logic is unchanged, only reformatted for readability +-- and numbered Q1..Q12 (canonical order). The harness splits the file on semicolons, so a comment +-- must never contain one. 
+ +-- Q1: trips starting within 50km of Sedona city center, ordered by distance. +SELECT + t.t_tripkey, + ST_X(ST_GeomFromWKB(t.t_pickuploc)) AS pickup_lon, + ST_Y(ST_GeomFromWKB(t.t_pickuploc)) AS pickup_lat, + t.t_pickuptime, + ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromText('POINT (-111.7610 34.8697)')) AS distance_to_center +FROM trip t +WHERE ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromText('POINT (-111.7610 34.8697)'), 0.45) +ORDER BY distance_to_center ASC, t.t_tripkey ASC; + +-- Q2: count trips starting within the Coconino County (Arizona) zone. +SELECT COUNT(*) AS trip_count_in_coconino_county +FROM trip t +WHERE ST_Intersects( + ST_GeomFromWKB(t.t_pickuploc), + (SELECT ST_GeomFromWKB(z.z_boundary) FROM zone z WHERE z.z_name = 'Coconino County' LIMIT 1) +); + +-- Q3: monthly trip statistics within 15km of Sedona city center (10km bbox + 5km buffer). +SELECT + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + COUNT(t.t_tripkey) AS total_trips, + AVG(t.t_distance) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_fare) AS avg_fare +FROM trip t +WHERE ST_DWithin( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromText('POLYGON((-111.9060 34.7347, -111.6160 34.7347, -111.6160 35.0047, -111.9060 35.0047, -111.9060 34.7347))'), + 0.045 +) +GROUP BY pickup_month +ORDER BY pickup_month; + +-- Q4: zone distribution of the top 1000 trips by tip amount. +SELECT z.z_zonekey, z.z_name, COUNT(*) AS trip_count +FROM zone z +JOIN ( + SELECT t.t_pickuploc + FROM trip t + ORDER BY t.t_tip DESC, t.t_tripkey ASC + LIMIT 1000 +) top_trips ON ST_Within(ST_GeomFromWKB(top_trips.t_pickuploc), ST_GeomFromWKB(z.z_boundary)) +GROUP BY z.z_zonekey, z.z_name +ORDER BY trip_count DESC, z.z_zonekey ASC; + +-- Q5: monthly travel patterns for repeat customers (convex hull of dropoff locations). 
+SELECT + c.c_custkey, + c.c_name AS customer_name, + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + ST_Area(ST_ConvexHull(ST_Collect(ARRAY_AGG(ST_GeomFromWKB(t.t_dropoffloc))))) AS monthly_travel_hull_area, + COUNT(*) AS dropoff_count +FROM trip t +JOIN customer c ON t.t_custkey = c.c_custkey +GROUP BY c.c_custkey, c.c_name, pickup_month +HAVING dropoff_count > 5 +ORDER BY dropoff_count DESC, c.c_custkey ASC; + +-- Q6: zone statistics for trips intersecting a bounding box. +SELECT + z.z_zonekey, + z.z_name, + COUNT(t.t_tripkey) AS total_pickups, + AVG(t.t_totalamount) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration +FROM trip t, zone z +WHERE ST_Intersects( + ST_GeomFromText('POLYGON((-112.2110 34.4197, -111.3110 34.4197, -111.3110 35.3197, -112.2110 35.3197, -112.2110 34.4197))'), + ST_GeomFromWKB(z.z_boundary) +) +AND ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary)) +GROUP BY z.z_zonekey, z.z_name +ORDER BY total_pickups DESC, z.z_zonekey ASC; + +-- Q7: detect potential route detours by comparing reported vs. geometric distances. +WITH trip_lengths AS ( + SELECT + t.t_tripkey, + t.t_distance AS reported_distance_m, + ST_Length(ST_MakeLine(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(t.t_dropoffloc))) / 0.000009 AS line_distance_m + FROM trip t +) +SELECT + t.t_tripkey, + t.reported_distance_m, + t.line_distance_m, + t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio +FROM trip_lengths t +ORDER BY detour_ratio DESC NULLS LAST, reported_distance_m DESC, t_tripkey ASC; + +-- Q8: count nearby pickups for each building within ~500m. +SELECT b.b_buildingkey, b.b_name, COUNT(*) AS nearby_pickup_count +FROM trip t +JOIN building b ON ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary), 0.0045) +GROUP BY b.b_buildingkey, b.b_name +ORDER BY nearby_pickup_count DESC, b.b_buildingkey ASC; + +-- Q9: building conflation (duplicate/overlap detection via IoU). 
+WITH b1 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom FROM building +), +b2 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom FROM building +), +pairs AS ( + SELECT + b1.id AS building_1, + b2.id AS building_2, + ST_Area(b1.geom) AS area1, + ST_Area(b2.geom) AS area2, + ST_Area(ST_Intersection(b1.geom, b2.geom)) AS overlap_area + FROM b1 + JOIN b2 ON b1.id < b2.id AND ST_Intersects(b1.geom, b2.geom) +) +SELECT + building_1, + building_2, + area1, + area2, + overlap_area, + CASE + WHEN overlap_area = 0 THEN 0.0 + WHEN (area1 + area2 - overlap_area) = 0 THEN 1.0 + ELSE overlap_area / (area1 + area2 - overlap_area) + END AS iou +FROM pairs +ORDER BY iou DESC, building_1 ASC, building_2 ASC; + +-- Q10: zone statistics for trips starting within each zone. +SELECT + z.z_zonekey, + z.z_name AS pickup_zone, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_distance) AS avg_distance, + COUNT(t.t_tripkey) AS num_trips +FROM zone z +LEFT JOIN trip t ON ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary)) +GROUP BY z.z_zonekey, z.z_name +ORDER BY avg_duration DESC NULLS LAST, z.z_zonekey ASC; + +-- Q11: count trips that cross between different zones. +SELECT COUNT(*) AS cross_zone_trip_count +FROM trip t +JOIN zone pickup_zone ON ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(pickup_zone.z_boundary)) +JOIN zone dropoff_zone ON ST_Within(ST_GeomFromWKB(t.t_dropoffloc), ST_GeomFromWKB(dropoff_zone.z_boundary)) +WHERE pickup_zone.z_zonekey != dropoff_zone.z_zonekey; + +-- Q12: five nearest buildings per trip pickup (CROSS JOIN LATERAL, since DuckDB spatial has no ST_KNN). 
+SELECT + t.t_tripkey, + t.t_pickuploc, + nb.b_buildingkey, + nb.building_name, + nb.distance_to_building +FROM trip t +CROSS JOIN LATERAL ( + SELECT + b.b_buildingkey, + b.b_name AS building_name, + ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary)) AS distance_to_building + FROM building b + ORDER BY distance_to_building + LIMIT 5 +) AS nb +ORDER BY nb.distance_to_building, nb.b_buildingkey; diff --git a/vortex-bench/src/benchmark.rs b/vortex-bench/src/benchmark.rs index 2872a02aa64..a7a76585b70 100644 --- a/vortex-bench/src/benchmark.rs +++ b/vortex-bench/src/benchmark.rs @@ -8,6 +8,7 @@ use glob::Pattern; use url::Url; use crate::BenchmarkDataset; +use crate::Engine; use crate::Format; /// Specification for a table in a benchmark dataset. @@ -32,6 +33,13 @@ pub trait Benchmark: Send + Sync { /// Get all available queries for this benchmark fn queries(&self) -> anyhow::Result<Vec<(usize, String)>>; + /// SQL an `engine` must run before this benchmark's queries (e.g. loading engine + /// extensions). Runners replay these after every (re)open. Default: none. + fn engine_init_sql(&self, engine: Engine) -> Vec<String> { + let _ = engine; + Vec::new() + } + /// Generate or prepare base data for the benchmark (typically Parquet format). /// This is the canonical source data that can be converted to other formats. /// This should be idempotent - safe to call multiple times. diff --git a/vortex-bench/src/datasets/mod.rs b/vortex-bench/src/datasets/mod.rs index d5c712404d5..cf00a25c3d7 100644 --- a/vortex-bench/src/datasets/mod.rs +++ b/vortex-bench/src/datasets/mod.rs @@ -71,6 +71,8 @@ pub enum BenchmarkDataset { ClickBenchSorted, #[serde(rename = "public-bi")] PublicBi { name: String }, + #[serde(rename = "spatialbench")] + SpatialBench { scale_factor: String }, #[serde(rename = "statpopgen")] StatPopGen { n_rows: u64 }, #[serde(rename = "polarsignals")] @@ -90,6 +92,7 @@ impl BenchmarkDataset { BenchmarkDataset::ClickBench { ..
} => "clickbench", BenchmarkDataset::ClickBenchSorted => "clickbench-sorted", BenchmarkDataset::PublicBi { .. } => "public-bi", + BenchmarkDataset::SpatialBench { .. } => "spatialbench", BenchmarkDataset::StatPopGen { .. } => "statpopgen", BenchmarkDataset::PolarSignals { .. } => "polarsignals", BenchmarkDataset::Fineweb => "fineweb", @@ -110,6 +113,9 @@ impl Display for BenchmarkDataset { }, BenchmarkDataset::ClickBenchSorted => write!(f, "clickbench-sorted"), BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"), + BenchmarkDataset::SpatialBench { scale_factor } => { + write!(f, "spatialbench(sf={scale_factor})") + } BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"), BenchmarkDataset::PolarSignals { n_rows } => { write!(f, "polarsignals(n_rows={n_rows})") @@ -168,6 +174,7 @@ impl BenchmarkDataset { ], BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::ClickBenchSorted => &["hits"], BenchmarkDataset::PublicBi { .. } => todo!(), + BenchmarkDataset::SpatialBench { .. } => &["trip", "building", "zone"], BenchmarkDataset::StatPopGen { .. } => &["statpopgen"], BenchmarkDataset::PolarSignals { .. 
} => &["stacktraces"], BenchmarkDataset::Fineweb => &["fineweb"], diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 8981b4859cf..daef12c7e78 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -35,6 +35,8 @@ use vortex::file::VortexWriteOptions; use vortex::file::WriteStrategyBuilder; use vortex::utils::aliases::hash_map::HashMap; +use crate::spatialbench::SpatialBenchBenchmark; + pub mod appian; pub mod benchmark; pub mod clickbench; @@ -52,6 +54,7 @@ pub mod public_bi; pub mod random_access; pub mod realnest; pub mod runner; +pub mod spatialbench; pub mod statpopgen; pub mod tpcds; pub mod tpch; @@ -268,6 +271,8 @@ pub enum BenchmarkArg { PolarSignals, #[clap(name = "public-bi")] PublicBi, + #[clap(name = "spatialbench")] + SpatialBench, } /// Default scale factor for TPC-related benchmarks @@ -334,6 +339,12 @@ pub fn create_benchmark(b: BenchmarkArg, opts: &Opts) -> anyhow::Result { + let scale_factor = opts.get(SCALE_FACTOR_KEY).unwrap_or(DEFAULT_SCALE_FACTOR); + let remote_data_dir = opts.get_as::(REMOTE_DATA_KEY); + let benchmark = SpatialBenchBenchmark::new(scale_factor.to_string(), remote_data_dir)?; + Ok(Box::new(benchmark) as _) + } } } diff --git a/vortex-bench/src/spatialbench/benchmark.rs b/vortex-bench/src/spatialbench/benchmark.rs new file mode 100644 index 00000000000..b619a067609 --- /dev/null +++ b/vortex-bench/src/spatialbench/benchmark.rs @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
SpatialBench benchmark implementation + +use std::fs; + +use url::Url; + +use crate::Benchmark; +use crate::BenchmarkDataset; +use crate::Engine; +use crate::Format; +use crate::TableSpec; +use crate::spatialbench::datagen; +use crate::utils::file::resolve_data_url; +use crate::workspace_root; + +/// SpatialBench geospatial benchmark (Apache Sedona): a `trip` point table, `building` polygons, and +/// a `customer` attribute table, queried with spatial filters and joins. `zone` polygons are sourced +/// externally and registered when present. See <https://github.com/apache/sedona-spatialbench>. +pub struct SpatialBenchBenchmark { + pub scale_factor: String, + pub data_url: Url, +} + +impl SpatialBenchBenchmark { + pub fn new(scale_factor: String, use_remote_data_dir: Option<String>) -> anyhow::Result<Self> { + Ok(Self { + data_url: resolve_data_url( + use_remote_data_dir.as_deref(), + &format!("spatialbench/{scale_factor}"), + )?, + scale_factor, + }) + } +} + +#[async_trait::async_trait] +impl Benchmark for SpatialBenchBenchmark { + /// All SpatialBench queries, numbered Q1.. in `spatialbench.sql` file order (1-based, matching + /// canonical SpatialBench). Geometry is stored as WKB and read back as a DuckDB `BLOB` (via + /// `ST_GeomFromWKB`), so the `spatial` extension evaluates every `ST_*` predicate — no native + /// geometry support is needed on the Vortex side. + fn queries(&self) -> anyhow::Result<Vec<(usize, String)>> { + // `;`-separated; a `;` must not appear in a comment, or it would split a statement in two.
+ let queries_file = workspace_root() + .join("vortex-bench") + .join("spatialbench") + .with_extension("sql"); + let contents = fs::read_to_string(queries_file)?; + Ok(contents + .split_terminator(';') + .map(str::trim) + .filter(|stmt| !stmt.is_empty()) + .enumerate() + .map(|(idx, stmt)| (idx + 1, stmt.to_string())) + .collect()) + } + + async fn generate_base_data(&self) -> anyhow::Result<()> { + if self.data_url.scheme() != "file" { + return Ok(()); + } + let base_data_dir = self + .data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", self.data_url.as_str()))?; + datagen::generate_tables(&self.scale_factor, base_data_dir).await?; + Ok(()) + } + + fn data_url(&self) -> &Url { + &self.data_url + } + + fn expected_row_counts(&self) -> Option<Vec<usize>> { + // Indexed by `query_idx` (1-based), so index 0 is a dummy and Q1's count is at index 1 (TPC-H + // convention). Only SF1.0 and SF10.0 are validated (like TPC-H); other scale factors return + // `None`. Each vec covers Q1..Q9 — the queries that finish — and is identical for Parquet and + // Vortex. Q10..Q12 are heavy spatial joins that time out, so they are left unvalidated (a + // shorter vec means the runner skips them).
+ match self.scale_factor.as_str() { + "1.0" => Some(vec![0, 94, 1, 22, 258, 316691, 3, 6000000, 369, 37]), + "10.0" => Some(vec![0, 994, 1, 79, 231, 3144328, 3, 60000000, 9357, 573]), + _ => None, + } + } + + fn dataset(&self) -> BenchmarkDataset { + BenchmarkDataset::SpatialBench { + scale_factor: self.scale_factor.clone(), + } + } + + fn dataset_name(&self) -> &str { + "spatialbench" + } + + fn dataset_display(&self) -> String { + format!("spatialbench(sf={})", self.scale_factor) + } + + fn table_specs(&self) -> Vec<TableSpec> { + let mut specs = vec![ + TableSpec::new("trip", None), + TableSpec::new("building", None), + TableSpec::new("customer", None), + ]; + // `zone` is externally sourced and optional; register it only when present so queries that + // don't need it don't fail on the missing glob. + let zone_present = match self.data_url.to_file_path() { + Ok(base) => zone_parquet_present(&base.join(Format::Parquet.name())), + Err(()) => true, + }; + if zone_present { + specs.push(TableSpec::new("zone", None)); + } + specs + } + + /// Scope each table to its own `{table}_*.{ext}` files; the default globs every file in the + /// format dir, conflating the `trip` and `building` schemas. + fn pattern(&self, table_name: &str, format: Format) -> Option<glob::Pattern> { + Some( + format!("{}_*.{}", table_name, format.ext()) + .parse() + .expect("valid glob pattern"), + ) + } + + /// DuckDB needs the `spatial` extension for `ST_*`; the runner replays it on each (re)open. + /// First INSTALL needs network. + fn engine_init_sql(&self, engine: Engine) -> Vec<String> { + match engine { + Engine::DuckDB => vec!["INSTALL spatial;".to_string(), "LOAD spatial;".to_string()], + _ => Vec::new(), + } + } +} + +/// Whether an externally-sourced `zone_*.parquet` exists under `parquet_dir` (generated by the +/// upstream `spatialbench-cli`; see the module docs).
+fn zone_parquet_present(parquet_dir: &std::path::Path) -> bool { + glob::glob(&parquet_dir.join("zone_*.parquet").to_string_lossy()) + .map(|mut paths| paths.next().is_some()) + .unwrap_or(false) +} diff --git a/vortex-bench/src/spatialbench/datagen/mod.rs b/vortex-bench/src/spatialbench/datagen/mod.rs new file mode 100644 index 00000000000..8ebd1b35d86 --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/mod.rs @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench data preparation. [`wkb`] generates the canonical WKB base tables (Parquet + Vortex); +//! the [`table`] catalog is the single source of truth for the base tables. + +pub mod table; +pub mod wkb; + +pub use table::Table; +pub use wkb::generate_tables; diff --git a/vortex-bench/src/spatialbench/datagen/table.rs b/vortex-bench/src/spatialbench/datagen/table.rs new file mode 100644 index 00000000000..c924428cc0e --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/table.rs @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The shared SpatialBench table catalog: one source of truth for the base tables generated by +//! [`super::wkb`]. + +/// A SpatialBench base table. +#[derive(Clone, Copy)] +pub enum Table { + Trip, + Building, + Customer, + Zone, +} + +/// Base tables generated in-process from the scale factor. `Zone` is excluded — it is sourced +/// externally (the `spatialbench` crate ships no zone generator). +pub(crate) const TABLES: &[Table] = &[Table::Trip, Table::Building, Table::Customer]; + +impl Table { + /// File stem under a format directory, e.g. `Trip` → `trip_{part}.parquet`. 
+ pub(crate) fn name(self) -> &'static str { + match self { + Table::Trip => "trip", + Table::Building => "building", + Table::Customer => "customer", + Table::Zone => "zone", + } + } +} diff --git a/vortex-bench/src/spatialbench/datagen/wkb.rs b/vortex-bench/src/spatialbench/datagen/wkb.rs new file mode 100644 index 00000000000..56f2d092d88 --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/wkb.rs @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench WKB base-table generation via the `spatialbench` crates (a tpchgen-rs fork). +//! Geometry is emitted as WKB, which DuckDB reads directly as `GEOMETRY` via `ST_GeomFromWKB`. + +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Result; +// spatialbench emits arrow-56 batches, so they must be written with its matching arrow-56 +// parquet crate, not the workspace's arrow-58 one. The parquet file itself is version-neutral. +use spatialbench::generators::BuildingGenerator; +use spatialbench::generators::CustomerGenerator; +use spatialbench::generators::TripGenerator; +use spatialbench_arrow::BuildingArrow; +use spatialbench_arrow::CustomerArrow; +use spatialbench_arrow::RecordBatchIterator; +use spatialbench_arrow::TripArrow; +use spatialbench_parquet::arrow::AsyncArrowWriter; +use spatialbench_parquet::basic::Compression; +use spatialbench_parquet::file::properties::WriterProperties; +use tokio::fs::File as TokioFile; +use tracing::info; + +use super::table::TABLES; +use super::table::Table; +use crate::Format; +use crate::utils::file::idempotent_async; + +/// Batch size matching the TPC-H generator's streaming batches. +const BATCH_SIZE: usize = 8192 * 64; + +/// Batch iterator for one partition of `table`, from the arrow-56 `spatialbench` crates. 
+fn iterator( + table: Table, + scale_factor: f64, + part: i32, + part_count: i32, +) -> Box { + match table { + Table::Trip => Box::new( + TripArrow::new(TripGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + Table::Building => Box::new( + BuildingArrow::new(BuildingGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + Table::Customer => Box::new( + CustomerArrow::new(CustomerGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + // Zone is sourced externally (the published `spatialbench` crate has no generator); it is + // never emitted by `generate_tables`, which only iterates `TABLES`. + Table::Zone => unreachable!("zone is sourced externally, not generated in-process"), + } +} + +/// Generate the SpatialBench base tables as parquet under `{output_dir}/parquet/`. +pub async fn generate_tables(scale_factor: &str, output_dir: PathBuf) -> Result<()> { + let scale_factor = scale_factor.parse::()?; + let parquet_dir = output_dir.join(Format::Parquet.name()); + fs::create_dir_all(&parquet_dir)?; + + // One part per unit of scale factor keeps each file near the ~350MB the trip generator + // produces at SF1. 
+ #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let num_parts = (scale_factor.ceil() as usize).max(1); + let part_count = i32::try_from(num_parts)?; + + for &table in TABLES { + for part_idx in 0..num_parts { + let output_file = parquet_dir.join(format!("{}_{part_idx}.parquet", table.name())); + let part = i32::try_from(part_idx + 1)?; + + idempotent_async(output_file.to_string_lossy().as_ref(), |path| async move { + info!( + scale_factor, + part, + part_count, + table = table.name(), + "Generating SpatialBench table" + ); + + let iter = iterator(table, scale_factor, part, part_count); + let schema = Arc::clone(iter.schema()); + + let file = TokioFile::create(&path).await?; + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = AsyncArrowWriter::try_new(file, schema, Some(props))?; + for batch in iter { + writer.write(&batch).await?; + } + writer.close().await?; + + Ok::<(), anyhow::Error>(()) + }) + .await?; + } + } + + Ok(()) +} diff --git a/vortex-bench/src/spatialbench/mod.rs b/vortex-bench/src/spatialbench/mod.rs new file mode 100644 index 00000000000..bba06bd7ef9 --- /dev/null +++ b/vortex-bench/src/spatialbench/mod.rs @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench geospatial analytics benchmark. +//! +//! See . + +pub mod benchmark; +pub mod datagen; + +pub use benchmark::SpatialBenchBenchmark; diff --git a/vortex-bench/src/v3.rs b/vortex-bench/src/v3.rs index 17456efc682..a7529866f80 100644 --- a/vortex-bench/src/v3.rs +++ b/vortex-bench/src/v3.rs @@ -295,6 +295,7 @@ fn canonical_tpc_scale_factor(scale_factor: &str) -> String { /// | `GhArchive` | `gharchive` | `None` | `None` | | /// | `Appian` | `appian` | `None` | `None` | Static dataset; no scale factor. | /// | `PublicBi { name }` | `public-bi` | dataset name (e.g. 
`cms-provider`) | `None` | Sub-dataset name lives in `dataset_variant`. | +/// | `SpatialBench { scale_factor }` | `spatialbench` | `None` | SF as string | Same canonicalization as TPC-H; no historical v2 records to merge with. | pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, Option) { match d { BenchmarkDataset::TpcH { scale_factor } => ( @@ -320,6 +321,11 @@ pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, // live). Drop it to keep live ingests merging into the migrated // group. The dataset-level `n_rows` is recoverable from the bench // matrix if ever needed. + BenchmarkDataset::SpatialBench { scale_factor } => ( + "spatialbench".to_string(), + None, + Some(canonical_tpc_scale_factor(scale_factor)), + ), BenchmarkDataset::StatPopGen { .. } => ("statpopgen".to_string(), None, None), BenchmarkDataset::PolarSignals { .. } => ("polarsignals".to_string(), None, None), BenchmarkDataset::Fineweb => ("fineweb".to_string(), None, None), From 2bb398f266036b87c112d3fe53c468360b958a0b Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Fri, 26 Jun 2026 10:38:32 -0400 Subject: [PATCH 02/10] fix: lint Signed-off-by: Nemo Yu --- vortex-bench/src/benchmark.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vortex-bench/src/benchmark.rs b/vortex-bench/src/benchmark.rs index a7a76585b70..fe16df5cd3d 100644 --- a/vortex-bench/src/benchmark.rs +++ b/vortex-bench/src/benchmark.rs @@ -35,8 +35,7 @@ pub trait Benchmark: Send + Sync { /// SQL an `engine` must run before this benchmark's queries (e.g. loading engine /// extensions). Runners replay these after every (re)open. Default: none. 
- fn engine_init_sql(&self, engine: Engine) -> Vec { - let _ = engine; + fn engine_init_sql(&self, _engine: Engine) -> Vec { Vec::new() } From 816ef7b1ae6b2c1f18d4f3dacd8ef3fcacbff12c Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Fri, 26 Jun 2026 13:38:47 -0400 Subject: [PATCH 03/10] fix: lint Signed-off-by: Nemo Yu --- vortex-bench/src/spatialbench/benchmark.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-bench/src/spatialbench/benchmark.rs b/vortex-bench/src/spatialbench/benchmark.rs index b619a067609..6ce786ea205 100644 --- a/vortex-bench/src/spatialbench/benchmark.rs +++ b/vortex-bench/src/spatialbench/benchmark.rs @@ -79,7 +79,7 @@ impl Benchmark for SpatialBenchBenchmark { // convention). Only SF1.0 and SF10.0 are validated (like TPC-H); other scale factors return // `None`. Each vec covers Q1..Q9 — the queries that finish — and is identical for Parquet and // Vortex. Q10..Q12 are heavy spatial joins that time out, so they are left unvalidated (a - // shorter vec means the runner skips them). + // shorter vec means the runner skips them). 
match self.scale_factor.as_str() { "1.0" => Some(vec![0, 94, 1, 22, 258, 316691, 3, 6000000, 369, 37]), "10.0" => Some(vec![0, 994, 1, 79, 231, 3144328, 3, 60000000, 9357, 573]), From fdb08721e897f28d44b42e65e8d6e227498341ff Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Fri, 26 Jun 2026 13:33:32 -0400 Subject: [PATCH 04/10] feat: wire in the vx-benchmark Signed-off-by: Nemo Yu --- bench-orchestrator/bench_orchestrator/cli.py | 8 ++- .../bench_orchestrator/config.py | 27 +++++++- bench-orchestrator/tests/test_config.py | 26 ++++++++ vortex-bench/src/conversions.rs | 60 +++++++++++++++++- vortex-bench/src/spatialbench/datagen/wkb.rs | 62 +++++++++++++++++++ 5 files changed, 175 insertions(+), 8 deletions(-) diff --git a/bench-orchestrator/bench_orchestrator/cli.py b/bench-orchestrator/bench_orchestrator/cli.py index a9e37e70309..b9d7f9bb2ef 100644 --- a/bench-orchestrator/bench_orchestrator/cli.py +++ b/bench-orchestrator/bench_orchestrator/cli.py @@ -85,9 +85,11 @@ def run_ref_auto_complete() -> list[str]: return list(map(lambda x: x.run_id, ResultStore().list_runs(limit=None))) -def targets_from_axes(engine: str, format: str) -> tuple[list[BenchmarkTarget], list[str]]: +def targets_from_axes( + engine: str, format: str, benchmark: Benchmark | None = None +) -> tuple[list[BenchmarkTarget], list[str]]: """Resolve legacy engine/format axes into explicit benchmark targets.""" - return resolve_axis_targets(parse_engines(engine), parse_formats(format)) + return resolve_axis_targets(parse_engines(engine), parse_formats(format), benchmark) def backends_for_engines(engines: list[Engine]) -> list[Engine]: @@ -260,7 +262,7 @@ def run( targets = parse_targets_json(targets_json) warnings: list[str] = [] else: - targets, warnings = targets_from_axes(engine, format) + targets, warnings = targets_from_axes(engine, format, benchmark) except ValueError as exc: console.print(f"[red]{exc}[/red]") raise typer.Exit(1) from exc diff --git a/bench-orchestrator/bench_orchestrator/config.py 
b/bench-orchestrator/bench_orchestrator/config.py index c597e84c6be..e358bf18f01 100644 --- a/bench-orchestrator/bench_orchestrator/config.py +++ b/bench-orchestrator/bench_orchestrator/config.py @@ -52,6 +52,7 @@ class Benchmark(Enum): POLARSIGNALS = "polarsignals" PUBLIC_BI = "public-bi" STATPOPGEN = "statpopgen" + SPATIALBENCH = "spatialbench" # Engine to supported formats mapping. @@ -72,6 +73,19 @@ class Benchmark(Enum): Engine.LANCE: [Format.LANCE], } +# Engines each benchmark can run on. Benchmarks default to *every* engine; list one here only to +# restrict it. SpatialBench's queries use DuckDB-specific `ST_*` spatial SQL that DataFusion has no +# functions for yet. +BENCHMARK_ENGINES: dict[Benchmark, frozenset[Engine]] = { + Benchmark.SPATIALBENCH: frozenset({Engine.DUCKDB}), +} + + +def engines_for_benchmark(benchmark: Benchmark) -> frozenset[Engine]: + """Return the engines `benchmark` supports, defaulting to every engine when unrestricted.""" + return BENCHMARK_ENGINES.get(benchmark, frozenset(Engine)) + + T = TypeVar("T") @@ -175,13 +189,16 @@ def parse_formats_json(value: str) -> list[Format]: def resolve_axis_targets( - engines: Iterable[Engine], formats: Iterable[Format] + engines: Iterable[Engine], formats: Iterable[Format], benchmark: Benchmark | None = None ) -> tuple[list[BenchmarkTarget], list[str]]: """Expand engine/format axes into supported explicit targets.""" warnings: list[str] = [] targets: list[BenchmarkTarget] = [] for engine in engines: + if benchmark is not None and engine not in engines_for_benchmark(benchmark): + warnings.append(f"Benchmark {benchmark.value} does not support engine {engine.value}") + continue for fmt in formats: target = BenchmarkTarget(engine=engine, format=fmt).normalized() if not target.is_supported(): @@ -200,7 +217,9 @@ def group_targets_by_backend(targets: Iterable[BenchmarkTarget]) -> dict[Engine, return groups -def validate_targets(targets: Iterable[BenchmarkTarget], options: dict[str, str]) -> list[str]: 
+def validate_targets( + targets: Iterable[BenchmarkTarget], options: dict[str, str], benchmark: Benchmark | None = None +) -> list[str]: """Validate explicit targets against benchmark runner constraints.""" errors: list[str] = [] @@ -208,6 +227,8 @@ def validate_targets(targets: Iterable[BenchmarkTarget], options: dict[str, str] for target in normalized_targets: if not target.is_supported(): errors.append(f"Format {target.format.value} is not supported by engine {target.engine.value}") + if benchmark is not None and target.engine not in engines_for_benchmark(benchmark): + errors.append(f"Benchmark {benchmark.value} does not support engine {target.engine.value}") if options.get("remote-data-dir") and any(target.format == Format.LANCE for target in normalized_targets): errors.append("Lance format is not supported for remote storage benchmarks.") @@ -242,7 +263,7 @@ def backends(self) -> list[Engine]: def validate(self) -> list[str]: """Validate the configuration and return any errors.""" - return validate_targets(self.targets, self.options) + return validate_targets(self.targets, self.options, self.benchmark) @dataclass diff --git a/bench-orchestrator/tests/test_config.py b/bench-orchestrator/tests/test_config.py index c7e2d6bb291..f900048f87b 100644 --- a/bench-orchestrator/tests/test_config.py +++ b/bench-orchestrator/tests/test_config.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright the Vortex contributors from bench_orchestrator.config import ( + Benchmark, BenchmarkTarget, Engine, Format, @@ -39,6 +40,31 @@ def test_resolve_axis_targets_filters_unsupported_combinations() -> None: assert warnings == ["Format arrow is not supported by engine duckdb"] +def test_resolve_axis_targets_skips_engines_a_benchmark_cannot_run() -> None: + # SpatialBench is DuckDB-only (ST_* spatial SQL), so the DataFusion axis is dropped with a warning. 
+ targets, warnings = resolve_axis_targets( + [Engine.DATAFUSION, Engine.DUCKDB], + [Format.PARQUET, Format.VORTEX], + Benchmark.SPATIALBENCH, + ) + + assert targets == [ + BenchmarkTarget(engine=Engine.DUCKDB, format=Format.PARQUET), + BenchmarkTarget(engine=Engine.DUCKDB, format=Format.VORTEX), + ] + assert warnings == ["Benchmark spatialbench does not support engine datafusion"] + + +def test_validate_targets_rejects_engine_a_benchmark_cannot_run() -> None: + errors = validate_targets( + [BenchmarkTarget(engine=Engine.DATAFUSION, format=Format.PARQUET)], + {}, + Benchmark.SPATIALBENCH, + ) + + assert errors == ["Benchmark spatialbench does not support engine datafusion"] + + def test_validate_targets_rejects_remote_lance() -> None: errors = validate_targets( [BenchmarkTarget(engine=Engine.DATAFUSION, format=Format.LANCE)], diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 3f21ab30ba0..d618d937fb0 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -4,6 +4,7 @@ use std::fs; use std::path::Path; use std::path::PathBuf; +use std::sync::Arc; use futures::StreamExt; use futures::TryStreamExt; @@ -26,11 +27,19 @@ use vortex::array::arrow::FromArrowArray; use vortex::array::builders::builder_with_capacity; use vortex::array::stream::ArrayStreamAdapter; use vortex::array::stream::ArrayStreamExt; +use vortex::compressor::BtrBlocksCompressorBuilder; use vortex::dtype::DType; +use vortex::dtype::FieldPath; use vortex::dtype::arrow::FromArrowType; use vortex::error::VortexResult; use vortex::error::vortex_err; +use vortex::file::VortexWriteOptions; use vortex::file::WriteOptionsSessionExt; +use vortex::file::WriteStrategyBuilder; +use vortex::layout::LayoutStrategy; +use vortex::layout::layouts::chunked::writer::ChunkedLayoutStrategy; +use vortex::layout::layouts::compressed::CompressingStrategy; +use vortex::layout::layouts::flat::writer::FlatLayoutStrategy; use vortex::session::VortexSession; use 
crate::CompactionStrategy; @@ -126,8 +135,7 @@ pub async fn convert_parquet_file_to_vortex( .open(output_path) .await?; - compaction - .apply_options(SESSION.write_options()) + write_options_for(compaction, &dtype, is_spatialbench(parquet_path)) .write( &mut output_file, ArrayStreamExt::boxed(ArrayStreamAdapter::new(dtype, stream)), @@ -137,6 +145,54 @@ pub async fn convert_parquet_file_to_vortex( Ok(()) } +/// Whether `path` points at SpatialBench data. +fn is_spatialbench(path: &Path) -> bool { + path.components() + .any(|component| component.as_os_str() == "spatialbench") +} + +/// Vortex write options for converting `dtype`-shaped data. +/// +/// For SpatialBench (`skip_binary_dict`), the geometry blobs are large and +/// unique, so the dictionary builder balloons memory (tens of GB) for zero gain. +fn write_options_for( + compaction: CompactionStrategy, + dtype: &DType, + skip_binary_dict: bool, +) -> VortexWriteOptions { + let binary_fields: Vec<_> = match dtype { + DType::Struct(fields, _) if skip_binary_dict => fields + .names() + .iter() + .zip(fields.fields()) + .filter(|(_, field)| matches!(field, DType::Binary(_))) + .map(|(name, _)| name.clone()) + .collect(), + _ => Vec::new(), + }; + if binary_fields.is_empty() { + return compaction.apply_options(SESSION.write_options()); + } + + let mut builder = WriteStrategyBuilder::default(); + if matches!(compaction, CompactionStrategy::Compact) { + builder = + builder.with_btrblocks_builder(BtrBlocksCompressorBuilder::default().with_compact()); + } + for name in binary_fields { + builder = builder.with_field_writer(FieldPath::from_name(name), no_dict_layout()); + } + SESSION.write_options().with_strategy(builder.build()) +} + +/// A chunked + compressed layout that skips dictionary encoding for opaque `Binary` blobs. 
+fn no_dict_layout() -> Arc { + Arc::new(CompressingStrategy::new( + ChunkedLayoutStrategy::new(FlatLayoutStrategy::default()), + BtrBlocksCompressorBuilder::default().build(), + )) +} + /// Convert all Parquet files in a directory to Vortex format. /// /// This function reads Parquet files from `{input_path}/parquet/` and writes Vortex files to diff --git a/vortex-bench/src/spatialbench/datagen/wkb.rs b/vortex-bench/src/spatialbench/datagen/wkb.rs index 56f2d092d88..47309d52626 100644 --- a/vortex-bench/src/spatialbench/datagen/wkb.rs +++ b/vortex-bench/src/spatialbench/datagen/wkb.rs @@ -5,9 +5,11 @@ //! Geometry is emitted as WKB, which DuckDB reads directly as `GEOMETRY` via `ST_GeomFromWKB`. use std::fs; +use std::path::Path; use std::path::PathBuf; use std::sync::Arc; +use anyhow::Context; use anyhow::Result; // spatialbench emits arrow-56 batches, so they must be written with its matching arrow-56 // parquet crate, not the workspace's arrow-58 one. The parquet file itself is version-neutral. @@ -22,7 +24,9 @@ use spatialbench_parquet::arrow::AsyncArrowWriter; use spatialbench_parquet::basic::Compression; use spatialbench_parquet::file::properties::WriterProperties; use tokio::fs::File as TokioFile; +use tokio::process::Command; use tracing::info; +use tracing::warn; use super::table::TABLES; use super::table::Table; @@ -103,5 +107,63 @@ pub async fn generate_tables(scale_factor: &str, output_dir: PathBuf) -> Result< } } + // `zone` isn't one of the in-process `TABLES`; it comes from the upstream CLI. Best-effort: + // a missing/failed CLI shouldn't block the zone-free queries, so warn and carry on. + if let Err(e) = generate_zone(scale_factor, &parquet_dir).await { + warn!( + error = %e, + "zone table not generated — SpatialBench queries Q2/Q4/Q6/Q10/Q11 need it. Install the \ + upstream generator (`cargo install --path /spatialbench-cli`) or \ + set SPATIALBENCH_CLI to its binary, then re-run." 
+ ); + } + + Ok(()) +} + +/// Generate the externally-sourced `zone` table by shelling out to the upstream `spatialbench-cli`. +async fn generate_zone(scale_factor: f64, parquet_dir: &Path) -> Result<()> { + if parquet_dir.join("zone_0.parquet").exists() { + return Ok(()); + } + let cli = std::env::var("SPATIALBENCH_CLI").unwrap_or_else(|_| "spatialbench-cli".to_string()); + + // Generate into a scratch dir so the CLI's `zone.parquet` name can't collide with the base + // tables, then move the produced parts into place as `zone_{part}.parquet`. + // Start from an empty scratch dir (clear any leftover from an interrupted run). + let scratch = parquet_dir.join(".zone-scratch"); + fs::remove_dir_all(&scratch).ok(); + fs::create_dir_all(&scratch)?; + + info!( + scale_factor, + cli, "Generating SpatialBench zone table via spatialbench-cli" + ); + let status = Command::new(&cli) + .arg("-s") + .arg(scale_factor.to_string()) + .args(["-T", "zone", "-f", "parquet", "-o"]) + .arg(&scratch) + .status() + .await + .with_context(|| format!("failed to spawn `{cli}` (is it installed / on PATH?)"))?; + anyhow::ensure!( + status.success(), + "`{cli}` exited with {status} while generating zone" + ); + + // The CLI writes `zone.parquet` (single part) or `zone/zone.N.parquet`. + let mut produced: Vec = glob::glob(&scratch.join("**/*.parquet").to_string_lossy())? 
+ .collect::>()?; + produced.sort(); + anyhow::ensure!( + !produced.is_empty(), + "`{cli}` produced no zone parquet under {}", + scratch.display() + ); + for (part_idx, src) in produced.iter().enumerate() { + fs::rename(src, parquet_dir.join(format!("zone_{part_idx}.parquet")))?; + } + fs::remove_dir_all(&scratch).ok(); Ok(()) } From ebaa79a4925fdffddadb89b5d0fb2e5d4eeb138b Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Fri, 26 Jun 2026 17:26:16 -0400 Subject: [PATCH 05/10] feat: support geo multipolygon Signed-off-by: Nemo Yu --- Cargo.lock | 14 + Cargo.toml | 1 + vortex-geo/Cargo.toml | 1 + vortex-geo/src/extension/mod.rs | 4 + vortex-geo/src/extension/multipolygon.rs | 372 +++++++++++++++++++++++ vortex-geo/src/extension/point.rs | 52 +++- vortex-geo/src/extension/polygon.rs | 48 ++- vortex-geo/src/lib.rs | 4 + 8 files changed, 482 insertions(+), 14 deletions(-) create mode 100644 vortex-geo/src/extension/multipolygon.rs diff --git a/Cargo.lock b/Cargo.lock index 1ad8b82a5b4..fe315a74b7c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4238,6 +4238,19 @@ dependencies = [ "wkt 0.14.0", ] +[[package]] +name = "geoarrow-cast" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41c308d653690a4e8ef3cbba69696056bd819e624766ece66d64cc26a638acc1" +dependencies = [ + "arrow-schema 58.3.0", + "geo-traits", + "geoarrow-array", + "geoarrow-schema", + "wkt 0.14.0", +] + [[package]] name = "geoarrow-schema" version = "0.8.0" @@ -10394,6 +10407,7 @@ dependencies = [ "geo-traits", "geo-types", "geoarrow", + "geoarrow-cast", "prost 0.14.4", "rstest", "vortex-array", diff --git a/Cargo.toml b/Cargo.toml index 903a6a46b10..f3c95e1b6c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -163,6 +163,7 @@ geo = "0.31.0" geo-traits = "0.3.0" geo-types = "0.7.19" geoarrow = "0.8.0" +geoarrow-cast = "0.8.0" get_dir = "0.5.0" glob = "0.3.2" goldenfile = "1" diff --git a/vortex-geo/Cargo.toml b/vortex-geo/Cargo.toml index e2f7e4dc10f..2f0583b49e6 100644 --- 
a/vortex-geo/Cargo.toml +++ b/vortex-geo/Cargo.toml @@ -20,6 +20,7 @@ geo = { workspace = true } geo-traits = { workspace = true } geo-types = { workspace = true } geoarrow = { workspace = true } +geoarrow-cast = { workspace = true } prost = { workspace = true } vortex-array = { workspace = true } vortex-error = { workspace = true } diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs index 684c83bade0..5cccc489297 100644 --- a/vortex-geo/src/extension/mod.rs +++ b/vortex-geo/src/extension/mod.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors pub(crate) mod coordinate; +mod multipolygon; mod point; mod polygon; mod wkb; @@ -12,6 +13,7 @@ use std::sync::Arc; use geo_types::Geometry; use geoarrow::datatypes::Crs; use geoarrow::datatypes::Metadata; +pub use multipolygon::*; pub use point::*; pub use polygon::*; use vortex_array::ArrayRef; @@ -46,6 +48,8 @@ pub(crate) fn geometries( point_geometries(&storage, ctx) } else if ext.is::() { polygon_geometries(&storage, ctx) + } else if ext.is::() { + multipolygon_geometries(&storage, ctx) } else { vortex_bail!("geo: unsupported geometry extension {}", array.dtype()) } diff --git a/vortex-geo/src/extension/multipolygon.rs b/vortex-geo/src/extension/multipolygon.rs new file mode 100644 index 00000000000..82fe081b316 --- /dev/null +++ b/vortex-geo/src/extension/multipolygon.rs @@ -0,0 +1,372 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The [`MultiPolygon`] extension type (`vortex.geo.multipolygon`), stored as +//! `List>>>` (polygons → rings → coordinates) and tagged with +//! [`GeoMetadata`]. A single `Polygon` is a one-element multipolygon. 
+ +use std::sync::Arc; + +use arrow_array::ArrayRef as ArrowArrayRef; +use arrow_schema::DataType; +use arrow_schema::Field; +use arrow_schema::extension::ExtensionType; +use geo_traits::to_geo::ToGeoGeometry; +use geo_types::Geometry; +use geoarrow::array::GeoArrowArray; +use geoarrow::array::GeoArrowArrayAccessor; +use geoarrow::array::IntoArrow; +use geoarrow::array::MultiPolygonArray; +use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::GeoArrowType; +use geoarrow::datatypes::MultiPolygonType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; +use prost::Message; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrow::ArrowExport; +use vortex_array::arrow::ArrowExportVTable; +use vortex_array::arrow::ArrowImport; +use vortex_array::arrow::ArrowImportVTable; +use vortex_array::arrow::ArrowSession; +use vortex_array::arrow::ArrowSessionExt; +use vortex_array::arrow::FromArrowArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::arrow::FromArrowType; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::dtype::extension::ExtId; +use vortex_array::dtype::extension::ExtVTable; +use vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_session::registry::CachedId; +use vortex_session::registry::Id; + +use super::GeoMetadata; +use super::coordinate::Dimension; +use super::coordinate::coordinate_dimension; +use super::coordinate::coordinate_storage_dtype; +use super::geo_metadata_from_arrow; +use super::geoarrow_metadata; + +/// A multipolygon (`geoarrow.multipolygon`); a single `Polygon` is a one-element multipolygon. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct MultiPolygon; + +impl ExtVTable for MultiPolygon { + type Metadata = GeoMetadata; + // No cheap owned value like Point's `Coordinate`; expose the raw storage scalar. + type NativeValue<'a> = &'a ScalarValue; + + fn id(&self) -> ExtId { + ExtId::new_static("vortex.geo.multipolygon") + } + + fn serialize_metadata(&self, metadata: &Self::Metadata) -> VortexResult> { + Ok(metadata.encode_to_vec()) + } + + fn deserialize_metadata(&self, metadata: &[u8]) -> VortexResult { + Ok(GeoMetadata::decode(metadata)?) + } + + fn validate_dtype(ext_dtype: &ExtDType) -> VortexResult<()> { + multipolygon_dimension(ext_dtype.storage_dtype()).map(|_| ()) + } + + fn unpack_native<'a>( + _ext_dtype: &'a ExtDType, + storage_value: &'a ScalarValue, + ) -> VortexResult<&'a ScalarValue> { + Ok(storage_value) + } +} + +/// Storage `List>>`: polygons → rings → coordinates. +pub(crate) fn multipolygon_storage_dtype(dim: Dimension, nullability: Nullability) -> DType { + let coords = coordinate_storage_dtype(dim, Nullability::NonNullable); + let ring = DType::List(Arc::new(coords), Nullability::NonNullable); + let polygon = DType::List(Arc::new(ring), Nullability::NonNullable); + DType::List(Arc::new(polygon), nullability) +} + +/// Validate `dtype` is `List>>` and return its [`Dimension`]. 
+pub(crate) fn multipolygon_dimension(dtype: &DType) -> VortexResult { + let DType::List(polygon, _) = dtype else { + vortex_bail!("multipolygon storage must be a List of polygons, was {dtype}"); + }; + let DType::List(ring, _) = polygon.as_ref() else { + vortex_bail!("multipolygon polygon storage must be a List of rings, was {polygon}"); + }; + let DType::List(coords, _) = ring.as_ref() else { + vortex_bail!("multipolygon ring storage must be a List of coordinates, was {ring}"); + }; + coordinate_dimension(coords) +} + +static ARROW_MULTIPOLYGON: CachedId = CachedId::new(MultiPolygonType::NAME); + +/// The `geoarrow.multipolygon` type for `dimension`, with separated (struct) coordinates. +fn multipolygon_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> MultiPolygonType { + MultiPolygonType::new(dimension.into(), geoarrow_metadata(geo_metadata)) +} + +/// Decode storage to `geo_types` for the geo scalar functions (CRS is irrelevant to planar ops). +pub(crate) fn multipolygon_geometries( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + multipolygon_array(storage, ctx)? + .iter() + .map(|geometry| -> VortexResult> { + Ok(geometry + .ok_or_else(|| vortex_err!("geo: null geometry is not supported"))? + .map_err(|e| vortex_err!("geo: geometry access failed: {e}"))? + .to_geometry()) + }) + .collect() +} + +/// Build a geoarrow `MultiPolygonArray` from the `MultiPolygon` storage. +fn multipolygon_array( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let multipolygon_type = multipolygon_type( + &GeoMetadata::default(), + multipolygon_dimension(storage.dtype())?, + ); + let session = ctx.session().clone(); + let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; + MultiPolygonArray::try_from((arrow.as_ref(), multipolygon_type)) + .map_err(|e| vortex_err!("failed to construct MultiPolygonArray: {e}")) +} + +/// A validated `MultiPolygon` array (`try_from` checks the extension type). 
+pub struct MultiPolygonData(ExtensionArray); + +impl TryFrom for MultiPolygonData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + vortex_ensure!( + ext.ext_dtype().is::(), + "expected a MultiPolygon extension array" + ); + Ok(MultiPolygonData(ext)) + } +} + +impl MultiPolygonData { + /// Serialize multipolygons to WKB (a view array) via geoarrow's cast — the form DuckDB + /// `GEOMETRY` takes. + pub fn to_wkb(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let multipolygons = multipolygon_array(&self.0.storage_array().clone(), ctx)?; + let wkb_type = + GeoArrowType::WkbView(WkbType::new(geoarrow_metadata(&GeoMetadata::default()))); + let wkb = cast(&multipolygons, &wkb_type) + .map_err(|e| vortex_err!("failed to cast multipolygons to WKB: {e}"))?; + ArrayRef::from_arrow(wkb.to_array_ref().as_ref(), false) + } +} + +impl ArrowExportVTable for MultiPolygon { + fn arrow_ext_id(&self) -> Id { + *ARROW_MULTIPOLYGON + } + + fn vortex_id(&self) -> Id { + self.id() + } + + fn to_arrow_field( + &self, + name: &str, + dtype: &DType, + session: &ArrowSession, + ) -> VortexResult> { + let ext_type = dtype.as_extension(); + let geo_metadata = ext_type.metadata::(); + let dimension = multipolygon_dimension(ext_type.storage_dtype())?; + + let mut field = session.to_arrow_field(name, ext_type.storage_dtype())?; + field.try_with_extension_type(multipolygon_type(geo_metadata, dimension))?; + + Ok(Some(field)) + } + + fn execute_arrow( + &self, + array: ArrayRef, + target: &Field, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let is_multipolygon = array + .dtype() + .as_extension_opt() + .map(|ext| ext.is::()) + .unwrap_or(false); + if !is_multipolygon { + return Ok(ArrowExport::Unsupported(array)); + } + + let Ok(multipolygon_meta) = target.try_extension_type::() else { + return Ok(ArrowExport::Unsupported(array)); + }; + if multipolygon_meta.coord_type() != CoordType::Separated { + return Ok(ArrowExport::Unsupported(array)); + } + + let 
executed = array.execute::(ctx)?; + let storage = executed.storage_array().clone(); + + let storage_field = Field::new( + String::new(), + target.data_type().clone(), + target.is_nullable(), + ); + let session = ctx.session().clone(); + let arrow_storage = session + .arrow() + .execute_arrow(storage, Some(&storage_field), ctx)?; + + // Round-trip through GeoArrow's multipolygon array; `into_arrow` is concrete, so wrap in `Arc`. + let multipolygons = + MultiPolygonArray::try_from((arrow_storage.as_ref(), multipolygon_meta)) + .map_err(|e| vortex_err!("failed to construct MultiPolygonArray: {e}"))?; + + Ok(ArrowExport::Exported(Arc::new(multipolygons.into_arrow()))) + } +} + +impl ArrowImportVTable for MultiPolygon { + fn arrow_ext_id(&self) -> Id { + *ARROW_MULTIPOLYGON + } + + /// Import a `geoarrow.multipolygon` field (matched by GeoArrow name). Accepts the full + /// `MultiPolygonType`, or a metadata-less literal (name only), inferring the dimension. + fn from_arrow_field(&self, field: &Field) -> VortexResult> { + let (dimension, metadata) = + if let Ok(multipolygon_meta) = field.try_extension_type::() { + vortex_ensure!( + multipolygon_meta.coord_type() == CoordType::Separated, + "geoarrow.multipolygon with interleaved coordinates is not supported; \ + re-encode with separated (struct) coordinates" + ); + ( + multipolygon_meta.dimension().into(), + geo_metadata_from_arrow(multipolygon_meta.metadata()), + ) + } else { + // Literal: peel the three `List` layers to the coordinate struct and read its + // dimension from the field names (the canonical check rejects nullable coordinates). 
+ if field.extension_type_name() != Some(MultiPolygonType::NAME) { + return Ok(None); + } + let DType::List(polygon, _) = DType::from_arrow(field) else { + return Ok(None); + }; + let DType::List(ring, _) = polygon.as_ref() else { + return Ok(None); + }; + let DType::List(coords, _) = ring.as_ref() else { + return Ok(None); + }; + let DType::Struct(fields, _) = coords.as_ref() else { + return Ok(None); + }; + let Ok(dimension) = Dimension::from_field_names(fields.names()) else { + return Ok(None); + }; + (dimension, GeoMetadata::default()) + }; + + let storage_dtype = multipolygon_storage_dtype(dimension, field.is_nullable().into()); + Ok(Some(DType::Extension( + ExtDType::try_with_vtable(MultiPolygon, metadata, storage_dtype)?.erased(), + ))) + } + + fn from_arrow_array( + &self, + array: ArrowArrayRef, + field: &Field, + dtype: &DType, + ) -> VortexResult { + let Some(ext_dtype) = dtype.as_extension_opt() else { + return Ok(ArrowImport::Unsupported(array)); + }; + if !ext_dtype.is::() + || field.try_extension_type::().is_err() + || !matches!(array.data_type(), DataType::List(_)) + { + return Ok(ArrowImport::Unsupported(array)); + } + + let storage = ArrayRef::from_arrow(array.as_ref(), field.is_nullable())?; + Ok(ArrowImport::Imported( + ExtensionArray::try_new(ext_dtype.clone(), storage)?.into_array(), + )) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use rstest::rstest; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + use vortex_array::dtype::extension::ExtDType; + use vortex_error::VortexResult; + + use super::MultiPolygon; + use super::multipolygon_storage_dtype; + use crate::extension::GeoMetadata; + use crate::extension::coordinate::Dimension; + use crate::extension::coordinate::coordinate_storage_dtype; + + fn geo_meta() -> GeoMetadata { + GeoMetadata { + crs: Some("EPSG:4326".to_string()), + } + } + + /// `MultiPolygon` accepts the canonical `List>>` storage of every + /// 
dimension. + #[rstest] + #[case::xy(Dimension::Xy)] + #[case::xyz(Dimension::Xyz)] + #[case::xym(Dimension::Xym)] + #[case::xyzm(Dimension::Xyzm)] + fn multipolygon_validates_every_dimension(#[case] dim: Dimension) -> VortexResult<()> { + let storage = multipolygon_storage_dtype(dim, Nullability::NonNullable); + ExtDType::::try_new(geo_meta(), storage)?; + Ok(()) + } + + /// Non-multipolygon storage is rejected at dtype construction: a bare struct (point) and a + /// double list (polygon) both fail. + #[test] + fn multipolygon_rejects_invalid_storage() -> VortexResult<()> { + let primitive = DType::Primitive(PType::F64, Nullability::NonNullable); + assert!(ExtDType::::try_new(geo_meta(), primitive).is_err()); + + // A double list (polygon) is not a multipolygon. + let coords = coordinate_storage_dtype(Dimension::Xy, Nullability::NonNullable); + let ring = DType::List(Arc::new(coords), Nullability::NonNullable); + let polygon = DType::List(Arc::new(ring), Nullability::NonNullable); + assert!(ExtDType::::try_new(geo_meta(), polygon).is_err()); + Ok(()) + } +} diff --git a/vortex-geo/src/extension/point.rs b/vortex-geo/src/extension/point.rs index 19e33c212f5..470182fe0ed 100644 --- a/vortex-geo/src/extension/point.rs +++ b/vortex-geo/src/extension/point.rs @@ -12,11 +12,15 @@ use arrow_schema::Field; use arrow_schema::extension::ExtensionType; use geo_traits::to_geo::ToGeoGeometry; use geo_types::Geometry; +use geoarrow::array::GeoArrowArray; use geoarrow::array::GeoArrowArrayAccessor; use geoarrow::array::IntoArrow; use geoarrow::array::PointArray; use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::GeoArrowType; use geoarrow::datatypes::PointType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; use prost::Message; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; @@ -37,6 +41,7 @@ use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; use vortex_array::scalar::Scalar; use 
vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_error::vortex_err; @@ -96,20 +101,51 @@ fn point_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> PointType { PointType::new(dimension.into(), geoarrow_metadata(geo_metadata)) } -/// Decode `Point` storage to `geo_types` points, for the geo scalar functions. -pub(crate) fn point_geometries( - storage: &ArrayRef, - ctx: &mut ExecutionCtx, -) -> VortexResult>> { +pub struct PointData(ExtensionArray); + +impl TryFrom for PointData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + vortex_ensure!( + ext.ext_dtype().is::(), + "expected a Point extension array" + ); + Ok(PointData(ext)) + } +} + +impl PointData { + /// Serialize points to WKB (a view array) via geoarrow's cast — the form DuckDB `GEOMETRY` takes. + pub fn to_wkb(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let points = point_array(&self.0.storage_array().clone(), ctx)?; + let wkb_type = + GeoArrowType::WkbView(WkbType::new(geoarrow_metadata(&GeoMetadata::default()))); + let wkb = cast(&points, &wkb_type) + .map_err(|e| vortex_err!("failed to cast points to WKB: {e}"))?; + ArrayRef::from_arrow(wkb.to_array_ref().as_ref(), false) + } +} + +/// Build a geoarrow `PointArray` from a `Point`'s `Struct` storage, shared by WKB export +/// and `geo_types` decoding. 
+fn point_array(storage: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { let point_type = point_type( &GeoMetadata::default(), coordinate_dimension(storage.dtype())?, ); let session = ctx.session().clone(); let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; - let points = PointArray::try_from((arrow.as_ref(), point_type)) - .map_err(|e| vortex_err!("failed to construct PointArray: {e}"))?; - points + PointArray::try_from((arrow.as_ref(), point_type)) + .map_err(|e| vortex_err!("failed to construct PointArray: {e}")) +} + +/// Decode `Point` storage to `geo_types` points, for the geo scalar functions. +pub(crate) fn point_geometries( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + point_array(storage, ctx)? .iter() .map(|geometry| -> VortexResult> { Ok(geometry diff --git a/vortex-geo/src/extension/polygon.rs b/vortex-geo/src/extension/polygon.rs index fc06ce59bd3..8d8a88fea17 100644 --- a/vortex-geo/src/extension/polygon.rs +++ b/vortex-geo/src/extension/polygon.rs @@ -13,11 +13,15 @@ use arrow_schema::Field; use arrow_schema::extension::ExtensionType; use geo_traits::to_geo::ToGeoGeometry; use geo_types::Geometry; +use geoarrow::array::GeoArrowArray; use geoarrow::array::GeoArrowArrayAccessor; use geoarrow::array::IntoArrow; use geoarrow::array::PolygonArray; use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::GeoArrowType; use geoarrow::datatypes::PolygonType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; use prost::Message; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; @@ -38,6 +42,7 @@ use vortex_array::dtype::extension::ExtDType; use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; use vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; @@ -117,12 +122,7 @@ pub(crate) fn polygon_geometries( storage: &ArrayRef, ctx: &mut 
ExecutionCtx, ) -> VortexResult>> { - let polygon_type = polygon_type(&GeoMetadata::default(), polygon_dimension(storage.dtype())?); - let session = ctx.session().clone(); - let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; - let polygons = PolygonArray::try_from((arrow.as_ref(), polygon_type)) - .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}"))?; - polygons + polygon_array(storage, ctx)? .iter() .map(|geometry| -> VortexResult> { Ok(geometry @@ -133,6 +133,42 @@ pub(crate) fn polygon_geometries( .collect() } +/// Build a geoarrow `PolygonArray` from a `Polygon`'s `List>` storage. +fn polygon_array(storage: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let polygon_type = polygon_type(&GeoMetadata::default(), polygon_dimension(storage.dtype())?); + let session = ctx.session().clone(); + let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; + PolygonArray::try_from((arrow.as_ref(), polygon_type)) + .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}")) +} + +/// A validated `Polygon` array (`try_from` checks the extension type). +pub struct PolygonData(ExtensionArray); + +impl TryFrom for PolygonData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + vortex_ensure!( + ext.ext_dtype().is::(), + "expected a Polygon extension array" + ); + Ok(PolygonData(ext)) + } +} + +impl PolygonData { + /// Serialize polygons to WKB (a view array) via geoarrow's cast — the form DuckDB `GEOMETRY` takes. 
+ pub fn to_wkb(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let polygons = polygon_array(&self.0.storage_array().clone(), ctx)?; + let wkb_type = + GeoArrowType::WkbView(WkbType::new(geoarrow_metadata(&GeoMetadata::default()))); + let wkb = cast(&polygons, &wkb_type) + .map_err(|e| vortex_err!("failed to cast polygons to WKB: {e}"))?; + ArrayRef::from_arrow(wkb.to_array_ref().as_ref(), false) + } +} + impl ArrowExportVTable for Polygon { fn arrow_ext_id(&self) -> Id { *ARROW_POLYGON diff --git a/vortex-geo/src/lib.rs b/vortex-geo/src/lib.rs index 951d93b7b4f..2cc8004efc5 100644 --- a/vortex-geo/src/lib.rs +++ b/vortex-geo/src/lib.rs @@ -8,6 +8,7 @@ use vortex_array::dtype::session::DTypeSessionExt; use vortex_array::scalar_fn::session::ScalarFnSessionExt; use vortex_session::VortexSession; +use crate::extension::MultiPolygon; use crate::extension::Point; use crate::extension::Polygon; use crate::extension::WellKnownBinary; @@ -32,6 +33,9 @@ pub fn initialize(session: &VortexSession) { session.dtypes().register(Polygon); session.arrow().register_exporter(Arc::new(Polygon)); session.arrow().register_importer(Arc::new(Polygon)); + session.dtypes().register(MultiPolygon); + session.arrow().register_exporter(Arc::new(MultiPolygon)); + session.arrow().register_importer(Arc::new(MultiPolygon)); // Register the geometry scalar functions. 
session.scalar_fns().register(GeoDistance); From a3c046b2b21b27ab4069c28729aa626c9fca52ae Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Fri, 26 Jun 2026 17:47:35 -0400 Subject: [PATCH 06/10] test: add tests for multipolygon Signed-off-by: Nemo Yu --- vortex-geo/src/tests/mod.rs | 1 + vortex-geo/src/tests/multipolygon.rs | 94 ++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 vortex-geo/src/tests/multipolygon.rs diff --git a/vortex-geo/src/tests/mod.rs b/vortex-geo/src/tests/mod.rs index 546de758eba..87b25ed1293 100644 --- a/vortex-geo/src/tests/mod.rs +++ b/vortex-geo/src/tests/mod.rs @@ -4,6 +4,7 @@ //! Arrow interop tests for the geospatial extension types, exercising the session wiring set up //! by [`crate::initialize`]. +mod multipolygon; mod point; mod wkb; diff --git a/vortex-geo/src/tests/multipolygon.rs b/vortex-geo/src/tests/multipolygon.rs new file mode 100644 index 00000000000..38f2543a96a --- /dev/null +++ b/vortex-geo/src/tests/multipolygon.rs @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Arrow interop for the `vortex.geo.multipolygon` extension type (`geoarrow.multipolygon`). + +use std::sync::Arc; + +use arrow_schema::DataType; +use arrow_schema::Field; +use arrow_schema::extension::ExtensionType as _; +use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::Crs; +use geoarrow::datatypes::Dimension as GeoArrowDimension; +use geoarrow::datatypes::Metadata; +use geoarrow::datatypes::MultiPolygonType; +use vortex_array::arrow::ArrowSessionExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_error::VortexResult; + +use super::SESSION; +use crate::extension::MultiPolygon; + +/// A `geoarrow.multipolygon` Arrow field with separated (struct) XY coordinates. 
+fn multipolygon_field(name: &str, nullable: bool, crs: Option<&str>) -> Field { + let crs = crs + .map(|crs| Crs::from_unknown_crs_type(crs.to_string())) + .unwrap_or_default(); + let metadata = Arc::new(Metadata::new(crs, None)); + MultiPolygonType::new(GeoArrowDimension::XY, metadata).to_field(name, nullable) +} + +/// An imported `geoarrow.multipolygon` field maps to the MultiPolygon extension dtype, recovering the +/// CRS, the `List>>>` storage, and nullability. +#[test] +fn import_field_recovers_extension() -> VortexResult<()> { + let field = multipolygon_field("geom", true, Some("EPSG:4326")); + let dtype = SESSION.arrow().from_arrow_field(&field)?; + + let DType::Extension(ext) = &dtype else { + panic!("expected Extension dtype, got {dtype}"); + }; + assert!(ext.is::()); + assert_eq!( + ext.metadata::().crs.as_deref(), + Some("EPSG:4326") + ); + + // Storage peels three List layers (multipolygon → polygons → rings) to the coordinate struct. + let DType::List(polygons, nullability) = ext.storage_dtype() else { + panic!("expected List storage, got {}", ext.storage_dtype()); + }; + assert_eq!(*nullability, Nullability::Nullable); + let DType::List(rings, _) = polygons.as_ref() else { + panic!("expected List of polygons"); + }; + let DType::List(coords, _) = rings.as_ref() else { + panic!("expected List of rings"); + }; + let DType::Struct(fields, _) = coords.as_ref() else { + panic!("expected coordinate Struct"); + }; + let names: Vec<&str> = fields.names().iter().map(|n| n.as_ref()).collect(); + assert_eq!(names, vec!["x", "y"]); + Ok(()) +} + +/// A field with interleaved (`FixedSizeList`) coordinates fails to import. 
+#[test] +fn import_interleaved_field_fails() { + let multipolygon_type = MultiPolygonType::new(GeoArrowDimension::XY, Default::default()) + .with_coord_type(CoordType::Interleaved); + let field = multipolygon_type.to_field("geom", false); + assert!(SESSION.arrow().from_arrow_field(&field).is_err()); +} + +/// A field imported to the MultiPolygon dtype and exported back carries the `geoarrow.multipolygon` +/// extension over its `List` storage. +#[test] +fn export_field_carries_extension() -> VortexResult<()> { + let imported = + SESSION + .arrow() + .from_arrow_field(&multipolygon_field("geom", false, Some("EPSG:4326")))?; + let field = SESSION.arrow().to_arrow_field("geom", &imported)?; + + assert_eq!(field.extension_type_name(), Some(MultiPolygonType::NAME)); + assert!( + matches!(field.data_type(), DataType::List(_)), + "expected List storage, got {}", + field.data_type() + ); + Ok(()) +} From 5f88e4a4dbb0956b5c62b3ffd5071d945d2baeb7 Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Mon, 29 Jun 2026 09:46:47 -0400 Subject: [PATCH 07/10] feat: wire vortex-native into duckdb-bench Signed-off-by: Nemo Yu --- Cargo.lock | 3 + benchmarks/datafusion-bench/src/lib.rs | 7 +- benchmarks/duckdb-bench/src/lib.rs | 5 +- benchmarks/duckdb-bench/src/main.rs | 4 +- vortex-bench/Cargo.toml | 3 + vortex-bench/src/benchmark.rs | 20 +++ vortex-bench/src/lib.rs | 22 ++- vortex-bench/src/spatialbench/benchmark.rs | 120 +++++++++++-- vortex-bench/src/spatialbench/datagen/mod.rs | 7 +- .../src/spatialbench/datagen/native.rs | 165 ++++++++++++++++++ .../src/spatialbench/datagen/table.rs | 55 +++++- vortex-duckdb/src/convert/dtype.rs | 14 +- vortex-duckdb/src/exporter/extension.rs | 18 ++ vortex-duckdb/src/exporter/geo.rs | 34 ++++ 14 files changed, 445 insertions(+), 32 deletions(-) create mode 100644 vortex-bench/src/spatialbench/datagen/native.rs diff --git a/Cargo.lock b/Cargo.lock index fe315a74b7c..490623bc6c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9897,6 +9897,8 @@ dependencies 
= [ "bzip2", "clap", "futures", + "geoarrow", + "geoarrow-cast", "get_dir", "glob", "humansize", @@ -9930,6 +9932,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-geo", "vortex-tensor", ] diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index 1100d38d24e..c45aa99be2c 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -111,10 +111,9 @@ pub fn format_to_df_format(format: Format) -> Arc { Format::Csv => Arc::new(CsvFormat::default()) as _, Format::Arrow => Arc::new(ArrowFormat), Format::Parquet => Arc::new(ParquetFormat::new()), - Format::OnDiskVortex | Format::VortexCompact => Arc::new(VortexFormat::new_with_options( - SESSION.clone(), - vortex_table_options(), - )), + Format::OnDiskVortex | Format::VortexCompact | Format::VortexNative => Arc::new( + VortexFormat::new_with_options(SESSION.clone(), vortex_table_options()), + ), Format::OnDiskDuckDB | Format::Lance => { unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`") } diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs index 4ec1efd0993..bf64f123956 100644 --- a/benchmarks/duckdb-bench/src/lib.rs +++ b/benchmarks/duckdb-bench/src/lib.rs @@ -169,7 +169,10 @@ impl DuckClient { file_format: Format, ) -> Result<()> { let object_type = match file_format { - Format::Parquet | Format::OnDiskVortex | Format::VortexCompact => "VIEW", + Format::Parquet + | Format::OnDiskVortex + | Format::VortexCompact + | Format::VortexNative => "VIEW", Format::OnDiskDuckDB => "TABLE", Format::Lance => { anyhow::bail!( diff --git a/benchmarks/duckdb-bench/src/main.rs b/benchmarks/duckdb-bench/src/main.rs index cf4fa071067..95d43cce954 100644 --- a/benchmarks/duckdb-bench/src/main.rs +++ b/benchmarks/duckdb-bench/src/main.rs @@ -142,6 +142,7 @@ fn main() -> anyhow::Result<()> { // OnDiskDuckDB tables are created during register_tables by loading from Parquet _ => {} } + 
benchmark.prepare_format(format, &base_path).await?; } anyhow::Ok(()) @@ -197,7 +198,8 @@ fn main() -> anyhow::Result<()> { if !args.reuse { ctx.reopen()?; } - ctx.execute_query_result(query) + let query = benchmark.query_for_format(query, format); + ctx.execute_query_result(&query) }, )?; diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 96180f8bcff..55028d8dbd0 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -24,6 +24,7 @@ vortex = { workspace = true, features = [ "zstd", ] } vortex-tensor = { workspace = true } # TODO(connor): In the future, this might be inside vortex. +vortex-geo = { workspace = true } anyhow = { workspace = true } arrow-array = { workspace = true } @@ -33,6 +34,8 @@ async-trait = { workspace = true } bzip2 = { workspace = true } clap = { workspace = true, features = ["derive"] } futures = { workspace = true } +geoarrow = { workspace = true } +geoarrow-cast = { workspace = true } get_dir = { workspace = true } glob = { workspace = true } humansize = { workspace = true } diff --git a/vortex-bench/src/benchmark.rs b/vortex-bench/src/benchmark.rs index fe16df5cd3d..a2eaa157e78 100644 --- a/vortex-bench/src/benchmark.rs +++ b/vortex-bench/src/benchmark.rs @@ -3,6 +3,8 @@ //! Core benchmark trait and types. +use std::path::Path; + use arrow_schema::Schema; use glob::Pattern; use url::Url; @@ -33,6 +35,11 @@ pub trait Benchmark: Send + Sync { /// Get all available queries for this benchmark fn queries(&self) -> anyhow::Result>; + /// Adapt a query to a specific storage `format` before execution. Default: unchanged. + fn query_for_format(&self, query: &str, _format: Format) -> String { + query.to_string() + } + /// SQL an `engine` must run before this benchmark's queries (e.g. loading engine /// extensions). Runners replay these after every (re)open. Default: none. 
fn engine_init_sql(&self, _engine: Engine) -> Vec { @@ -47,6 +54,13 @@ pub trait Benchmark: Send + Sync { /// call this method to ensure base data exists, then perform their own format conversion. async fn generate_base_data(&self) -> anyhow::Result<()>; + /// Prepare benchmark- and format-specific data beyond the Parquet base that + /// [`Benchmark::generate_base_data`] produced. Called once per requested format, after the base + /// data exists. Default: nothing. + async fn prepare_format(&self, _format: Format, _base_path: &Path) -> anyhow::Result<()> { + Ok(()) + } + /// Get expected row counts for validation (optional) /// If None, no validation will be performed fn expected_row_counts(&self) -> Option> { @@ -80,4 +94,10 @@ pub trait Benchmark: Send + Sync { _ = format; None } + + /// SQL projection substituted into `SELECT {..} FROM read_(..)` when registering + /// `table_name` as a DuckDB view. Defaults to `*`. + fn view_projection(&self, _table_name: &str, _format: Format) -> String { + "*".to_string() + } } diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index daef12c7e78..9bb6576fd08 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -76,8 +76,11 @@ use vortex::session::VortexSession; #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; -pub static SESSION: LazyLock = - LazyLock::new(|| VortexSession::default().with_tokio()); +pub static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::default().with_tokio(); + vortex_geo::initialize(&session); + session +}); #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)] pub struct Target { @@ -146,6 +149,9 @@ pub enum Format { #[clap(name = "vortex-compact")] #[serde(rename = "vortex-compact")] VortexCompact, + #[clap(name = "vortex-native")] + #[serde(rename = "vortex-native")] + VortexNative, #[clap(name = "duckdb")] #[serde(rename = "duckdb")] OnDiskDuckDB, @@ -185,6 +191,7 @@ impl Format { Format::Parquet => 
"parquet", Format::OnDiskVortex => "vortex-file-compressed", Format::VortexCompact => "vortex-compact", + Format::VortexNative => "vortex-native", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } @@ -197,6 +204,7 @@ impl Format { Format::Parquet => "parquet", Format::OnDiskVortex => "vortex", Format::VortexCompact => "vortex", + Format::VortexNative => "vortex", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } @@ -451,8 +459,16 @@ where object_type.to_lowercase() ); + let projection = benchmark.view_projection(name, load_format); + // SpatialBench's native and WKB lanes both register `trip` from the same db path but with different casts — + // so always replace views (cheap, metadata-only). Tables hold materialized data: keep them. + let create = if object_type == "VIEW" { + format!("CREATE OR REPLACE VIEW {name}") + } else { + format!("CREATE {object_type} IF NOT EXISTS {name}") + }; sql_statements.push(format!( - "CREATE {object_type} IF NOT EXISTS {name} AS SELECT * FROM read_{extension}('{base_dir}/{pattern}');\n", + "{create} AS SELECT {projection} FROM read_{extension}('{base_dir}/{pattern}');\n", )); } diff --git a/vortex-bench/src/spatialbench/benchmark.rs b/vortex-bench/src/spatialbench/benchmark.rs index 6ce786ea205..092c256621c 100644 --- a/vortex-bench/src/spatialbench/benchmark.rs +++ b/vortex-bench/src/spatialbench/benchmark.rs @@ -4,6 +4,7 @@ //! SpatialBench benchmark implementation use std::fs; +use std::path::Path; use url::Url; @@ -13,9 +14,13 @@ use crate::Engine; use crate::Format; use crate::TableSpec; use crate::spatialbench::datagen; +use crate::spatialbench::datagen::Table; use crate::utils::file::resolve_data_url; use crate::workspace_root; +/// Data-dir subfolder for the native-geometry Vortex files (the `vortex-native` lane). 
+pub const NATIVE_DIR: &str = "vortex-native"; + /// SpatialBench geospatial benchmark (Apache Sedona): a `trip` point table, `building` polygons, and /// a `customer` attribute table, queried with spatial filters and joins. `zone` polygons are sourced /// externally and registered when present. See . @@ -34,6 +39,21 @@ impl SpatialBenchBenchmark { scale_factor, }) } + + /// Tables to materialize and register: the in-process base tables (`trip`, `building`, + /// `customer`) plus the externally-sourced `zone` when its parquet is present. Shared by native + /// data-gen and table registration so both lanes cover the same set. + fn base_tables(&self) -> Vec { + let mut tables = vec![Table::Trip, Table::Building, Table::Customer]; + let zone_present = match self.data_url.to_file_path() { + Ok(base) => zone_parquet_present(&base.join(Format::Parquet.name())), + Err(()) => true, + }; + if zone_present { + tables.push(Table::Zone); + } + tables + } } #[async_trait::async_trait] @@ -58,6 +78,16 @@ impl Benchmark for SpatialBenchBenchmark { .collect()) } + /// On the `vortex-native` lane, geometry columns surface as `GEOMETRY`, so drop the + /// `ST_GeomFromWKB(..)` wrappers and let DuckDB's `spatial` extension evaluate the `ST_*` + /// predicates directly on the native geometry. 
+ fn query_for_format(&self, query: &str, format: Format) -> String { + match format { + Format::VortexNative => strip_wkb_wrappers(query), + _ => query.to_string(), + } + } + async fn generate_base_data(&self) -> anyhow::Result<()> { if self.data_url.scheme() != "file" { return Ok(()); @@ -66,7 +96,20 @@ impl Benchmark for SpatialBenchBenchmark { .data_url .to_file_path() .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", self.data_url.as_str()))?; - datagen::generate_tables(&self.scale_factor, base_data_dir).await?; + datagen::generate_tables(&self.scale_factor, base_data_dir.clone()).await?; + Ok(()) + } + + /// The `vortex-native` lane decodes each table's WKB geometry to native GeoArrow once, into the + /// `vortex-native` dir, so its queries read DuckDB `GEOMETRY` directly. Idempotent. + async fn prepare_format(&self, format: Format, base_path: &Path) -> anyhow::Result<()> { + if format == Format::VortexNative { + let parquet_dir = base_path.join(Format::Parquet.name()); + let native_dir = base_path.join(NATIVE_DIR); + for table in self.base_tables() { + datagen::write_native_vortex(table, &parquet_dir, &native_dir).await?; + } + } Ok(()) } @@ -74,6 +117,16 @@ impl Benchmark for SpatialBenchBenchmark { &self.data_url } + /// The `vortex-native` lane reads the native-geometry Vortex dir; every other format reads its + /// own `{format}` subfolder. + fn format_path(&self, format: Format, base_url: &Url) -> anyhow::Result { + let dir = match format { + Format::VortexNative => NATIVE_DIR, + other => other.name(), + }; + Ok(base_url.join(&format!("{dir}/"))?) + } + fn expected_row_counts(&self) -> Option> { // Indexed by `query_idx` (1-based), so index 0 is a dummy and Q1's count is at index 1 (TPC-H // convention). 
Only SF1.0 and SF10.0 are validated (like TPC-H); other scale factors return @@ -101,22 +154,32 @@ impl Benchmark for SpatialBenchBenchmark { format!("spatialbench(sf={})", self.scale_factor) } + /// Both lanes register the same tables (WKB reads `parquet`/`vortex`, native reads + /// `vortex-native`); `zone` is externally sourced and optional, registered only when present. fn table_specs(&self) -> Vec { - let mut specs = vec![ - TableSpec::new("trip", None), - TableSpec::new("building", None), - TableSpec::new("customer", None), - ]; - // `zone` is externally sourced and optional; register it only when present so queries that - // don't need it don't fail on the missing glob. - let zone_present = match self.data_url.to_file_path() { - Ok(base) => zone_parquet_present(&base.join(Format::Parquet.name())), - Err(()) => true, - }; - if zone_present { - specs.push(TableSpec::new("zone", None)); + self.base_tables() + .iter() + .map(|table| TableSpec::new(table.name(), None)) + .collect() + } + + /// DuckDB's view star-expansion drops native `GEOMETRY` columns down to `BLOB`, so `ST_*` fail to + /// bind. Re-cast every geometry column back to `GEOMETRY` in the view's projection. + fn view_projection(&self, table_name: &str, format: Format) -> String { + if format == Format::VortexNative + && let Some(table) = Table::from_name(table_name) + { + let geometry_columns = table.geometry_columns(); + if !geometry_columns.is_empty() { + let casts = geometry_columns + .iter() + .map(|column| format!("{name}::GEOMETRY AS {name}", name = column.name)) + .collect::>() + .join(", "); + return format!("* REPLACE ({casts})"); + } } - specs + "*".to_string() } /// Scope each table to its own `{table}_*.{ext}` files; the default globs every file in the @@ -141,8 +204,33 @@ impl Benchmark for SpatialBenchBenchmark { /// Whether an externally-sourced `zone_*.parquet` exists under `parquet_dir` (generated by the /// upstream `spatialbench-cli`; see the module docs). 
-fn zone_parquet_present(parquet_dir: &std::path::Path) -> bool { +fn zone_parquet_present(parquet_dir: &Path) -> bool { glob::glob(&parquet_dir.join("zone_*.parquet").to_string_lossy()) .map(|mut paths| paths.next().is_some()) .unwrap_or(false) } + +/// Strip `ST_GeomFromWKB()` → `` so the native lane reads the already-`GEOMETRY` +/// column directly. Assumes the wrapped expression contains no inner `)` (true for our column refs). +fn strip_wkb_wrappers(sql: &str) -> String { + const OPEN: &str = "ST_GeomFromWKB("; + let mut out = String::with_capacity(sql.len()); + let mut rest = sql; + while let Some(pos) = rest.find(OPEN) { + out.push_str(&rest[..pos]); + let after = &rest[pos + OPEN.len()..]; + match after.find(')') { + Some(close) => { + out.push_str(&after[..close]); + rest = &after[close + 1..]; + } + // Unbalanced wrapper: emit it verbatim and stop rewriting. + None => { + out.push_str(OPEN); + rest = after; + } + } + } + out.push_str(rest); + out +} diff --git a/vortex-bench/src/spatialbench/datagen/mod.rs b/vortex-bench/src/spatialbench/datagen/mod.rs index 8ebd1b35d86..7808e06cbc3 100644 --- a/vortex-bench/src/spatialbench/datagen/mod.rs +++ b/vortex-bench/src/spatialbench/datagen/mod.rs @@ -1,11 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! SpatialBench data preparation. [`wkb`] generates the canonical WKB base tables (Parquet + Vortex); -//! the [`table`] catalog is the single source of truth for the base tables. +//! SpatialBench data preparation. [`wkb`] generates the canonical WKB base tables; [`native`] derives +//! native-geometry Vortex files from them for `points=native`. The [`table`] catalog is the single +//! source of truth for the base tables both stages share. 
+pub mod native; pub mod table; pub mod wkb; +pub use native::write_native_vortex; pub use table::Table; pub use wkb::generate_tables; diff --git a/vortex-bench/src/spatialbench/datagen/native.rs b/vortex-bench/src/spatialbench/datagen/native.rs new file mode 100644 index 00000000000..1d600a8b8cf --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/native.rs @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Native-geometry prep for `points=native`: decode a table's WKB geometry to native +//! `vortex.geo.{point,polygon,multipolygon}` via `geoarrow_cast` (so Vortex never decodes WKB), then +//! write a Vortex file. A one-time cost; queries then see DuckDB `GEOMETRY` directly. + +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use arrow_array::RecordBatch; +use arrow_schema::DataType; +use arrow_schema::Schema; +use futures::TryStreamExt; +use geoarrow::array::GenericWkbArray; +use geoarrow::array::GeoArrowArray; +use geoarrow::array::WkbViewArray; +use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::Crs; +use geoarrow::datatypes::Dimension; +use geoarrow::datatypes::GeoArrowType; +use geoarrow::datatypes::Metadata; +use geoarrow::datatypes::MultiPolygonType; +use geoarrow::datatypes::PointType; +use geoarrow::datatypes::PolygonType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; +use parquet::arrow::ParquetRecordBatchStreamBuilder; +use tokio::fs::File as TokioFile; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::arrays::ChunkedArray; +use vortex::array::arrow::ArrowSessionExt; +use vortex::file::WriteOptionsSessionExt; + +use super::table::GeometryKind; +use super::table::Table; +use crate::SESSION; +use crate::utils::file::idempotent_async; + +fn geo_metadata() -> Arc { + Arc::new(Metadata::new(Crs::default(), None)) +} + +/// Write `{native_dir}/{table}_0.vortex` with native 
geometry columns from the WKB parquet. Idempotent. +pub async fn write_native_vortex( + table: Table, + parquet_dir: &Path, + native_dir: &Path, +) -> anyhow::Result { + idempotent_async( + native_dir.join(format!("{}_0.vortex", table.name())), + |path| async move { + let chunks = map_source_batches(parquet_dir, table, |b| native_chunk(b, table)).await?; + + let dtype = chunks[0].dtype().clone(); + let chunked = ChunkedArray::try_new(chunks, dtype)?.into_array(); + let mut file = TokioFile::create(&path).await?; + SESSION + .write_options() + .write(&mut file, chunked.to_array_stream()) + .await?; + tracing::info!(path = %path.display(), table = table.name(), "wrote native geometry table"); + Ok(()) + }, + ) + .await +} + +/// Apply `f` to every batch of `table`'s base WKB parquet parts. All columns are kept; only the +/// geometry columns are rewritten to native types. +async fn map_source_batches( + parquet_dir: &Path, + table: Table, + mut f: impl FnMut(RecordBatch) -> anyhow::Result, +) -> anyhow::Result> { + let pattern = parquet_dir.join(format!("{}_*.parquet", table.name())); + let mut files: Vec = + glob::glob(&pattern.to_string_lossy())?.collect::>()?; + files.sort(); + anyhow::ensure!(!files.is_empty(), "no parquet matching {pattern:?}"); + + let mut out = Vec::new(); + for file in files { + let builder = ParquetRecordBatchStreamBuilder::new(TokioFile::open(&file).await?).await?; + let mut stream = builder.build()?; + while let Some(batch) = stream.try_next().await? { + out.push(f(batch)?); + } + } + Ok(out) +} + +/// Rewrite each of `table`'s WKB geometry columns to its native-lane type, tagging the field with +/// the matching `geoarrow.*` extension. 
+fn native_record_batch(batch: RecordBatch, table: Table) -> anyhow::Result { + let schema = batch.schema(); + let mut fields = schema.fields().to_vec(); + let mut columns = batch.columns().to_vec(); + + for geom in table.geometry_columns() { + let idx = schema.index_of(geom.name)?; + let column = batch.column(idx).as_ref(); + let wkb_type = WkbType::new(geo_metadata()); + + // Wrap the source WKB. SpatialBench tables emit `Binary`; the external `zone` parquet uses + // `BinaryView`. + let wkb: Box = match column.data_type() { + DataType::Binary => Box::new(GenericWkbArray::::try_from((column, wkb_type))?), + DataType::LargeBinary => { + Box::new(GenericWkbArray::::try_from((column, wkb_type))?) + } + DataType::BinaryView => Box::new(WkbViewArray::try_from((column, wkb_type))?), + other => anyhow::bail!("{}: unsupported WKB column type {other}", geom.name), + }; + + // Decode to a native, separated-XY GeoArrow type. The columnar round-trip also normalizes + // WKB endianness (Overture ships big-endian; native types carry none). + let native: Arc = match geom.kind { + GeometryKind::Point => cast( + wkb.as_ref(), + &GeoArrowType::Point( + PointType::new(Dimension::XY, geo_metadata()) + .with_coord_type(CoordType::Separated), + ), + )?, + GeometryKind::Polygon => cast( + wkb.as_ref(), + &GeoArrowType::Polygon( + PolygonType::new(Dimension::XY, geo_metadata()) + .with_coord_type(CoordType::Separated), + ), + )?, + // Polygon promotes to a one-element multipolygon, so this also covers the mixed + // `Polygon`/`MultiPolygon` zone boundaries. + GeometryKind::MultiPolygon => cast( + wkb.as_ref(), + &GeoArrowType::MultiPolygon( + MultiPolygonType::new(Dimension::XY, geo_metadata()) + .with_coord_type(CoordType::Separated), + ), + )?, + }; + + columns[idx] = native.to_array_ref(); + fields[idx] = Arc::new(native.data_type().to_field(geom.name, false)); + } + + Ok(RecordBatch::try_new( + Arc::new(Schema::new(fields)), + columns, + )?) 
+} + +/// Convert a WKB batch to a Vortex struct chunk with `table`'s geometry columns as native types. +fn native_chunk(batch: RecordBatch, table: Table) -> anyhow::Result { + let native_batch = native_record_batch(batch, table)?; + let native_schema = native_batch.schema(); + SESSION + .arrow() + .from_arrow_record_batch(native_batch, &native_schema) + .context("importing native batch") +} diff --git a/vortex-bench/src/spatialbench/datagen/table.rs b/vortex-bench/src/spatialbench/datagen/table.rs index c924428cc0e..c72e0907f37 100644 --- a/vortex-bench/src/spatialbench/datagen/table.rs +++ b/vortex-bench/src/spatialbench/datagen/table.rs @@ -1,8 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! The shared SpatialBench table catalog: one source of truth for the base tables generated by -//! [`super::wkb`]. +//! The shared SpatialBench table catalog: one source of truth for the base tables, used by both the +//! WKB generation ([`super::wkb`]) and the native geometry conversion ([`super::native`]). /// A SpatialBench base table. #[derive(Clone, Copy)] @@ -17,6 +17,20 @@ pub enum Table { /// externally (the `spatialbench` crate ships no zone generator). pub(crate) const TABLES: &[Table] = &[Table::Trip, Table::Building, Table::Customer]; +/// A geometry column and the geometry type its WKB bytes decode to. +pub(crate) struct GeometryColumn { + pub(crate) name: &'static str, + pub(crate) kind: GeometryKind, +} + +/// Geometry types a column can decode to on the native lane. +#[derive(Clone, Copy, Debug)] +pub(crate) enum GeometryKind { + Point, + Polygon, + MultiPolygon, +} + impl Table { /// File stem under a format directory, e.g. `Trip` → `trip_{part}.parquet`. pub(crate) fn name(self) -> &'static str { @@ -27,4 +41,41 @@ impl Table { Table::Zone => "zone", } } + + /// The [`Table`] for a registered table name, or `None` for an unknown name. + pub(crate) fn from_name(name: &str) -> Option
{ + match name { + "trip" => Some(Table::Trip), + "building" => Some(Table::Building), + "customer" => Some(Table::Customer), + "zone" => Some(Table::Zone), + _ => None, + } + } + + /// Geometry columns to decode from WKB to native, with their geometry type. Empty for tables with + /// no geometry (e.g. `customer`). + pub(crate) fn geometry_columns(self) -> &'static [GeometryColumn] { + match self { + Table::Trip => &[ + GeometryColumn { + name: "t_pickuploc", + kind: GeometryKind::Point, + }, + GeometryColumn { + name: "t_dropoffloc", + kind: GeometryKind::Point, + }, + ], + Table::Building => &[GeometryColumn { + name: "b_boundary", + kind: GeometryKind::Polygon, + }], + Table::Customer => &[], + Table::Zone => &[GeometryColumn { + name: "z_boundary", + kind: GeometryKind::MultiPolygon, + }], + } + } } diff --git a/vortex-duckdb/src/convert/dtype.rs b/vortex-duckdb/src/convert/dtype.rs index 4238b354182..624ced63f56 100644 --- a/vortex-duckdb/src/convert/dtype.rs +++ b/vortex-duckdb/src/convert/dtype.rs @@ -58,6 +58,9 @@ use vortex::extension::datetime::Time; use vortex::extension::datetime::TimeUnit; use vortex::extension::datetime::Timestamp; use vortex_geo::extension::GeoMetadata; +use vortex_geo::extension::MultiPolygon; +use vortex_geo::extension::Point; +use vortex_geo::extension::Polygon; use vortex_geo::extension::WellKnownBinary; use crate::cpp::DUCKDB_TYPE; @@ -245,9 +248,14 @@ impl TryFrom<&DType> for LogicalType { return temporal_to_duckdb(temporal); } - if let Some(wkb) = ext_dtype.metadata_opt::() { - let crs = wkb.crs.as_ref(); - return LogicalType::geometry_type(crs.map(|crs| crs.as_str())); + // Native Point/Polygon and WKB all surface to DuckDB as GEOMETRY so `ST_*` bind. 
+ if let Some(geo) = ext_dtype + .metadata_opt::() + .or_else(|| ext_dtype.metadata_opt::()) + .or_else(|| ext_dtype.metadata_opt::()) + .or_else(|| ext_dtype.metadata_opt::()) + { + return LogicalType::geometry_type(geo.crs.as_deref()); } vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id()); diff --git a/vortex-duckdb/src/exporter/extension.rs b/vortex-duckdb/src/exporter/extension.rs index 221dc92a85f..30eb3715cc2 100644 --- a/vortex-duckdb/src/exporter/extension.rs +++ b/vortex-duckdb/src/exporter/extension.rs @@ -8,6 +8,12 @@ use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::extension::datetime::AnyTemporal; use vortex::error::VortexResult; use vortex::error::vortex_bail; +use vortex_geo::extension::MultiPolygon; +use vortex_geo::extension::MultiPolygonData; +use vortex_geo::extension::Point; +use vortex_geo::extension::PointData; +use vortex_geo::extension::Polygon; +use vortex_geo::extension::PolygonData; use vortex_geo::extension::WellKnownBinary; use vortex_geo::extension::WellKnownBinaryData; @@ -27,5 +33,17 @@ pub(crate) fn new_exporter( return geo::new_wkb_exporter(WellKnownBinaryData::try_from(ext)?, ctx); } + if ext.ext_dtype().is::() { + return geo::new_point_exporter(PointData::try_from(ext)?, ctx); + } + + if ext.ext_dtype().is::() { + return geo::new_polygon_exporter(PolygonData::try_from(ext)?, ctx); + } + + if ext.ext_dtype().is::() { + return geo::new_multipolygon_exporter(MultiPolygonData::try_from(ext)?, ctx); + } + vortex_bail!("no non-temporal extension exporter") } diff --git a/vortex-duckdb/src/exporter/geo.rs b/vortex-duckdb/src/exporter/geo.rs index 1287ed019e2..fc6634c121c 100644 --- a/vortex-duckdb/src/exporter/geo.rs +++ b/vortex-duckdb/src/exporter/geo.rs @@ -4,6 +4,9 @@ use vortex::array::ExecutionCtx; use vortex::array::arrays::VarBinViewArray; use vortex::error::VortexResult; +use vortex_geo::extension::MultiPolygonData; +use vortex_geo::extension::PointData; +use 
vortex_geo::extension::PolygonData; use vortex_geo::extension::WellKnownBinaryData; use crate::exporter::ColumnExporter; @@ -17,3 +20,34 @@ pub(crate) fn new_wkb_exporter( let values = array.wkb_values().clone().execute::(ctx)?; new_exporter(values, ctx) } + +/// Create an exporter for a native `Point` column. DuckDB `GEOMETRY` vectors carry WKB, so the +/// points are serialized to WKB via [`PointData::to_wkb`] (only for rows DuckDB materializes — +/// with predicate pushdown that's just the survivors). +pub(crate) fn new_point_exporter( + point: PointData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let values = point.to_wkb(ctx)?.execute::(ctx)?; + new_exporter(values, ctx) +} + +/// Create an exporter for a native `Polygon` column. Like [`new_point_exporter`], DuckDB `GEOMETRY` +/// vectors carry WKB, so the polygons are serialized to WKB via [`PolygonData::to_wkb`]. +pub(crate) fn new_polygon_exporter( + polygon: PolygonData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let values = polygon.to_wkb(ctx)?.execute::(ctx)?; + new_exporter(values, ctx) +} + +/// Create an exporter for a native `MultiPolygon` column, serialized to WKB via +/// [`MultiPolygonData::to_wkb`] (see [`new_point_exporter`]). 
+pub(crate) fn new_multipolygon_exporter( + multipolygon: MultiPolygonData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let values = multipolygon.to_wkb(ctx)?.execute::(ctx)?; + new_exporter(values, ctx) +} From 69a4f900e32d4c65e649e193bccf23aaf00965bd Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Mon, 29 Jun 2026 10:09:36 -0400 Subject: [PATCH 08/10] feat: wire geo native into vx-bench Signed-off-by: Nemo Yu --- .../bench_orchestrator/config.py | 2 ++ bench-orchestrator/tests/test_config.py | 34 +++++++++++++++++++ bench-orchestrator/tests/test_executor.py | 13 +++++++ 3 files changed, 49 insertions(+) diff --git a/bench-orchestrator/bench_orchestrator/config.py b/bench-orchestrator/bench_orchestrator/config.py index e358bf18f01..8c8275bc81e 100644 --- a/bench-orchestrator/bench_orchestrator/config.py +++ b/bench-orchestrator/bench_orchestrator/config.py @@ -35,6 +35,7 @@ class Format(Enum): PARQUET = "parquet" VORTEX = "vortex" VORTEX_COMPACT = "vortex-compact" + VORTEX_NATIVE = "vortex-native" DUCKDB = "duckdb" LANCE = "lance" @@ -68,6 +69,7 @@ class Benchmark(Enum): Format.PARQUET, Format.VORTEX, Format.VORTEX_COMPACT, + Format.VORTEX_NATIVE, Format.DUCKDB, ], Engine.LANCE: [Format.LANCE], diff --git a/bench-orchestrator/tests/test_config.py b/bench-orchestrator/tests/test_config.py index f900048f87b..a8f09c6b04b 100644 --- a/bench-orchestrator/tests/test_config.py +++ b/bench-orchestrator/tests/test_config.py @@ -26,6 +26,23 @@ def test_parse_formats_json_accepts_ci_format_arrays() -> None: assert formats == [Format.PARQUET, Format.VORTEX, Format.DUCKDB] +def test_parse_formats_json_accepts_vortex_native() -> None: + formats = parse_formats_json('["parquet","vortex","vortex-native"]') + + assert formats == [Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE] + + +def test_resolve_axis_targets_offers_vortex_native_on_duckdb_only() -> None: + # vortex-native is a DuckDB-only lane; the DataFusion axis is dropped as unsupported. 
def test_resolve_axis_targets_expands_spatialbench_three_lanes() -> None:
    """DuckDB x (parquet, vortex, vortex-native) on SpatialBench yields three targets, no warnings.

    The targets are expected in the same order as the requested formats.
    """
    # The single-command three-lane comparison: parquet, WKB vortex, and native-geometry vortex, all
    # on DuckDB.
    targets, warnings = resolve_axis_targets(
        [Engine.DUCKDB],
        [Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE],
        Benchmark.SPATIALBENCH,
    )

    assert targets == [
        BenchmarkTarget(engine=Engine.DUCKDB, format=Format.PARQUET),
        BenchmarkTarget(engine=Engine.DUCKDB, format=Format.VORTEX),
        BenchmarkTarget(engine=Engine.DUCKDB, format=Format.VORTEX_NATIVE),
    ]
    assert warnings == []
Format.VORTEX_NATIVE], + iterations=1, + options={"scale-factor": "1.0"}, + ) + + assert "parquet,vortex,vortex-native" in cmd + + def test_build_command_omits_formats_for_lance_backend() -> None: executor = BenchmarkExecutor(Path("/tmp/lance-bench"), Engine.LANCE) From 7b153516b143e3453ac2426ad8e3cc1a3e10c12b Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Mon, 29 Jun 2026 12:28:37 -0400 Subject: [PATCH 09/10] feat: support ST_DWithin pushdown in vortex Signed-off-by: Nemo Yu --- benchmarks/duckdb-bench/src/lib.rs | 10 ++ vortex-bench/src/spatialbench/benchmark.rs | 52 ++++++++- vortex-duckdb/cpp/expr.cpp | 43 ++++++++ vortex-duckdb/cpp/include/expr.h | 4 + vortex-duckdb/src/convert/expr.rs | 96 ++++++++++++++++- vortex-duckdb/src/duckdb/database.rs | 10 ++ vortex-geo/src/extension/mod.rs | 118 +++++++++++++++++++++ vortex-geo/src/scalar_fn/distance.rs | 88 ++++++++++++++- 8 files changed, 408 insertions(+), 13 deletions(-) diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs index bf64f123956..4399d18a778 100644 --- a/benchmarks/duckdb-bench/src/lib.rs +++ b/benchmarks/duckdb-bench/src/lib.rs @@ -78,6 +78,11 @@ impl DuckClient { for stmt in &statements { self.connection().query(stmt)?; } + // After `LOAD spatial`, register `vortex_dwithin` so the radius filter pushes. No-op without it. + self.db + .as_ref() + .vortex_expect("DuckClient database accessed after close") + .register_geo_aliases()?; self.init_sql = statements; Ok(()) } @@ -127,6 +132,11 @@ impl DuckClient { .vortex_expect("connection just opened") .query(stmt)?; } + // Re-register `vortex_dwithin` against the fresh instance. 
/// Route `ST_DWithin(..)` calls that have a geometry-literal operand (`ST_GeomFromText`) to the
/// `vortex_dwithin` alias, and leave every other call as `ST_DWithin`.
///
/// `vortex_dwithin` is only correct when it is pushed into the scan (its bind is cleared). Only
/// single-table filters against a literal push; a join (two column operands) does not, so it must
/// stay `ST_DWithin`. A call with no matching close paren is also left untouched.
fn route_pushable_dwithin(sql: &str) -> String {
    const CALL: &str = "ST_DWithin(";
    const ALIAS: &str = "vortex_dwithin(";
    const LITERAL: &str = "ST_GeomFromText";

    let mut rewritten = String::with_capacity(sql.len());
    let mut cursor = sql;
    while let Some((before, args)) = cursor.split_once(CALL) {
        rewritten.push_str(before);

        // Locate the paren closing this call; nested calls such as `ST_GeomFromText(..)` raise and
        // lower the nesting level on the way. Parens are ASCII, so byte offsets are char-safe.
        let mut nesting = 1usize;
        let close = args.bytes().position(|byte| {
            match byte {
                b'(' => nesting += 1,
                b')' => nesting -= 1,
                _ => {}
            }
            nesting == 0
        });

        // Only a balanced call whose arguments mention a geometry literal is routed.
        let routed = close.filter(|&end| args[..end].contains(LITERAL));
        if let Some(end) = routed {
            rewritten.push_str(ALIAS);
            rewritten.push_str(&args[..=end]);
            cursor = &args[end + 1..];
        } else {
            // No literal operand (a join) or unbalanced: keep `ST_DWithin` for DuckDB to evaluate.
            rewritten.push_str(CALL);
            cursor = args;
        }
    }
    rewritten.push_str(cursor);
    rewritten
}
it. + ScalarFunctionSet set("vortex_dwithin"); + for (const auto &overload : entry.functions.functions) { + ScalarFunction copy = overload; + copy.name = "vortex_dwithin"; + copy.SetErrorMode(FunctionErrors::CANNOT_ERROR); + // Clear the bind so the radius stays as children[2] for the Vortex converter + // (ST_DWithin's bind folds it into bind_data). vortex_dwithin is only pushed, never run. + copy.bind = nullptr; + set.AddFunction(copy); + } + CreateScalarFunctionInfo info(std::move(set)); + info.on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; + catalog.CreateFunction(context, info); + }); + } catch (const std::exception &) { + // No `spatial` loaded → no `ST_DWithin` to alias; nothing to register. + return DuckDBSuccess; + } + return DuckDBSuccess; +} + extern "C" const char *duckdb_vx_expr_to_string(duckdb_vx_expr ffi_expr) { if (!ffi_expr) { return nullptr; diff --git a/vortex-duckdb/cpp/include/expr.h b/vortex-duckdb/cpp/include/expr.h index 457a944e5d5..d0d76ffa3dc 100644 --- a/vortex-duckdb/cpp/include/expr.h +++ b/vortex-duckdb/cpp/include/expr.h @@ -13,6 +13,10 @@ typedef struct duckdb_vx_sfunc_ *duckdb_vx_sfunc; const char *duckdb_vx_sfunc_name(duckdb_vx_sfunc ffi_func); +/// Register `vortex_dwithin`, a non-throwing alias of the spatial extension's `ST_DWithin`, so the +/// radius filter pushes into the Vortex scan. +duckdb_state duckdb_vx_register_geo_aliases(duckdb_database ffi_db); + typedef struct duckdb_vx_expr_ *duckdb_vx_expr; /// Return the string representation of the expression. Must be freed with `duckdb_free`. 
diff --git a/vortex-duckdb/src/convert/expr.rs b/vortex-duckdb/src/convert/expr.rs index 324086e5775..1486f15bbd4 100644 --- a/vortex-duckdb/src/convert/expr.rs +++ b/vortex-duckdb/src/convert/expr.rs @@ -27,6 +27,7 @@ use vortex::expr::not; use vortex::expr::or_collect; use vortex::expr::root; use vortex::scalar::Scalar; +use vortex::scalar_fn::EmptyOptions; use vortex::scalar_fn::ScalarFnVTableExt; use vortex::scalar_fn::fns::between::Between; use vortex::scalar_fn::fns::between::BetweenOptions; @@ -36,6 +37,9 @@ use vortex::scalar_fn::fns::like::Like; use vortex::scalar_fn::fns::like::LikeOptions; use vortex::scalar_fn::fns::literal::Literal; use vortex::scalar_fn::fns::operators::Operator; +use vortex_geo::extension::WellKnownBinary; +use vortex_geo::extension::native_geometry_scalar_from_wkb; +use vortex_geo::scalar_fn::distance::GeoDistance; use crate::cpp::DUCKDB_VX_EXPR_TYPE; use crate::duckdb; @@ -57,6 +61,91 @@ fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult { } } +/// Read an `f64` from a constant expression (e.g. the `ST_DWithin` distance literal). +fn from_bound_f64(value: &duckdb::ExpressionRef) -> VortexResult { + match value.as_class().vortex_expect("unknown class") { + BoundConstant(constant) => f64::try_from(&Scalar::try_from(constant.value)?), + _ => vortex_bail!("Expected f64 constant, got {:?}", value.as_class_id()), + } +} + +/// Lower a geo operand: a `GEOMETRY` literal arrives as WKB, decoded once to its native type so the +/// pushed `GeoDistance` stays native; a column ref recurses. `None` (unsupported type) skips push. 
+fn geo_operand( + value: &duckdb::ExpressionRef, + col_sub: Option<&Expression>, +) -> VortexResult> { + if let Some(BoundConstant(constant)) = value.as_class() { + let scalar = Scalar::try_from(constant.value)?; + let DType::Extension(ext_dtype) = scalar.dtype() else { + return Ok(None); + }; + if !ext_dtype.is::() { + return Ok(None); + } + let storage = scalar.as_extension().to_storage_scalar(); + let Some(buf) = storage.as_binary_opt().and_then(|b| b.value()) else { + return Ok(None); + }; + return Ok(native_geometry_scalar_from_wkb(buf.as_slice())?.map(lit)); + } + try_from_expression_inner(value, col_sub) +} + +/// Lower geo UDFs to native Vortex geo ops so the work runs in the scan. `None` otherwise. +fn try_from_geo_function( + name: &str, + func: &BoundFunction, + col_sub: Option<&Expression>, +) -> VortexResult> { + // Catch-all for every bound function: reject non-geo names before touching the children. + if !is_geo_function(name) { + debug!("bound function {name}"); + return Ok(None); + } + let children: Vec<_> = func.children().collect(); + let expr = match name.to_ascii_lowercase().as_str() { + "vortex_dwithin" => { + if children.len() != 3 { + return Ok(None); + } + let Some(a) = geo_operand(children[0], col_sub)? else { + return Ok(None); + }; + let Some(b) = geo_operand(children[1], col_sub)? else { + return Ok(None); + }; + let distance = from_bound_f64(children[2])?; + let geo_distance = GeoDistance.new_expr(EmptyOptions, [a, b]); + Binary.new_expr(Operator::Lte, [geo_distance, lit(distance)]) + } + "st_distance" => { + if children.len() != 2 { + return Ok(None); + } + let Some(a) = geo_operand(children[0], col_sub)? else { + return Ok(None); + }; + let Some(b) = geo_operand(children[1], col_sub)? 
else { + return Ok(None); + }; + GeoDistance.new_expr(EmptyOptions, [a, b]) + } + _ => return Ok(None), + }; + + Ok(Some(expr)) +} + +/// Geo UDFs that `try_from_geo_function` lowers — shared with `can_push_expression` so the pushable +/// and lowered sets can't drift. +fn is_geo_function(name: &str) -> bool { + matches!( + name.to_ascii_lowercase().as_str(), + "vortex_dwithin" | "st_distance" + ) +} + fn try_from_bound_function( func: &BoundFunction, col_sub: Option<&Expression>, @@ -115,10 +204,8 @@ fn try_from_bound_function( }; Like.new_expr(LikeOptions::default(), [value, lit(pattern)]) } - _ => { - debug!("bound function {}", func.scalar_function.name()); - return Ok(None); - } + // Geo UDFs are handled here. + name => return try_from_geo_function(name, func, col_sub), }; Ok(Some(expr)) @@ -173,6 +260,7 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool { || name == "~~" || name == "!~~" || name == "strlen" + || (is_geo_function(name) && func.children().all(can_push_expression)) } ExpressionClass::BoundOperator(op) => { if !matches!( diff --git a/vortex-duckdb/src/duckdb/database.rs b/vortex-duckdb/src/duckdb/database.rs index ab86503b291..e85d3d4bc5f 100644 --- a/vortex-duckdb/src/duckdb/database.rs +++ b/vortex-duckdb/src/duckdb/database.rs @@ -90,4 +90,14 @@ impl DatabaseRef { ); Ok(()) } + + /// Register `vortex_dwithin`, a non-throwing alias of the `spatial` extension's `ST_DWithin`, so + /// the radius predicate pushes into the Vortex scan. 
    /// Register `vortex_dwithin`, a non-throwing alias of the `spatial` extension's `ST_DWithin`, so
    /// the radius predicate pushes into the Vortex scan.
    ///
    /// Thin wrapper over the FFI function `duckdb_vx_register_geo_aliases`. As written in this
    /// patch, the C++ side reports an error only for a null database handle; when `spatial` is not
    /// loaded (no `ST_DWithin` to alias) it swallows the exception and still reports success.
    ///
    /// # Errors
    /// Returns an error when the FFI call does not report success.
    pub fn register_geo_aliases(&self) -> VortexResult<()> {
        duckdb_try!(
            // SAFETY: the handle comes from `self.as_ptr()`, presumably a live database for as
            // long as `&self` is borrowed (TODO confirm); the callee null-checks it before use.
            unsafe { cpp::duckdb_vx_register_geo_aliases(self.as_ptr()) },
            "Failed to register geo aliases"
        );
        Ok(())
    }
+pub fn native_geometry_scalar_from_wkb(bytes: &[u8]) -> VortexResult> { + let metadata = geoarrow_metadata(&GeoMetadata::default()); + let binary = BinaryArray::from(vec![Some(bytes)]); + let wkb = GenericWkbArray::::try_from(( + &binary as &dyn arrow_array::Array, + WkbType::new(Arc::clone(&metadata)), + )) + .map_err(|e| vortex_err!("failed to read WKB literal: {e}"))?; + + // Cast the WKB value to `target`, import its native storage as a Vortex array. + let to_storage = |target: &GeoArrowType| -> VortexResult { + let native = + cast(&wkb, target).map_err(|e| vortex_err!("failed to cast WKB literal: {e}"))?; + ArrayRef::from_arrow(native.to_array_ref().as_ref(), false) + }; + + let scalar = match Wkb::try_from_bytes(bytes)?.geometry_type() { + GeometryType::Point => { + let target = GeoArrowType::Point( + PointType::new(Dimension::XY, metadata).with_coord_type(CoordType::Separated), + ); + geo_ext_scalar(Point, to_storage(&target)?)? + } + GeometryType::Polygon => { + let target = GeoArrowType::Polygon( + PolygonType::new(Dimension::XY, metadata).with_coord_type(CoordType::Separated), + ); + geo_ext_scalar(Polygon, to_storage(&target)?)? + } + GeometryType::MultiPolygon => { + let target = GeoArrowType::MultiPolygon( + MultiPolygonType::new(Dimension::XY, metadata) + .with_coord_type(CoordType::Separated), + ); + geo_ext_scalar(MultiPolygon, to_storage(&target)?)? + } + _ => return Ok(None), + }; + Ok(Some(scalar)) +} + +/// Wrap cast-from-WKB `storage` in its `vtable` extension type and pull out the single scalar. +// `scalar_at` is deprecated for `execute_scalar`, but there is no execution context at plan time. +#[allow(deprecated)] +fn geo_ext_scalar>( + vtable: V, + storage: ArrayRef, +) -> VortexResult { + let ext = ExtDType::try_with_vtable(vtable, GeoMetadata::default(), storage.dtype().clone())? + .erased(); + ExtensionArray::try_new(ext, storage)? 
+ .into_array() + .scalar_at(0) +} + /// Extension metadata that is common to all the geospatial extension types. /// /// Currently, this is just the coordinate reference system (CRS). @@ -116,7 +188,13 @@ pub(crate) fn geo_metadata_from_arrow(metadata: &Metadata) -> GeoMetadata { #[cfg(test)] mod tests { use prost::Message; + use vortex_array::dtype::DType; + use vortex_error::VortexResult; + use vortex_error::vortex_err; + use super::Point; + use super::Polygon; + use super::native_geometry_scalar_from_wkb; use crate::extension::GeoMetadata; #[test] @@ -131,4 +209,44 @@ mod tests { let decoded = GeoMetadata::decode(bytes.as_slice()).unwrap(); assert_eq!(decoded, meta); } + + /// A little-endian WKB `POINT` literal decodes to the native `Point` extension scalar (the Q1 + /// fast-path operand). + #[test] + fn decodes_wkb_point_to_native() -> VortexResult<()> { + let mut wkb = vec![1u8]; // little-endian byte order + wkb.extend_from_slice(&1u32.to_le_bytes()); // geometry type: point + wkb.extend_from_slice(&1.0f64.to_le_bytes()); // x + wkb.extend_from_slice(&2.0f64.to_le_bytes()); // y + + let scalar = native_geometry_scalar_from_wkb(&wkb)?.expect("a point scalar"); + let DType::Extension(ext) = scalar.dtype() else { + panic!("expected an extension dtype, got {}", scalar.dtype()); + }; + assert!(ext.is::()); + Ok(()) + } + + /// A little-endian WKB `POLYGON` literal decodes to the native `Polygon` extension scalar (the Q3 + /// point→polygon operand), proving the literal decode is not point-only. 
+ #[test] + fn decodes_wkb_polygon_to_native() -> VortexResult<()> { + let ring = [(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (0.0, 0.0)]; + let mut wkb = vec![1u8]; // little-endian byte order + wkb.extend_from_slice(&3u32.to_le_bytes()); // geometry type: polygon + wkb.extend_from_slice(&1u32.to_le_bytes()); // one ring + let ring_len = u32::try_from(ring.len()).map_err(|e| vortex_err!("{e}"))?; + wkb.extend_from_slice(&ring_len.to_le_bytes()); + for (x, y) in ring { + wkb.extend_from_slice(&f64::to_le_bytes(x)); + wkb.extend_from_slice(&f64::to_le_bytes(y)); + } + + let scalar = native_geometry_scalar_from_wkb(&wkb)?.expect("a polygon scalar"); + let DType::Extension(ext) = scalar.dtype() else { + panic!("expected an extension dtype, got {}", scalar.dtype()); + }; + assert!(ext.is::()); + Ok(()) + } } diff --git a/vortex-geo/src/scalar_fn/distance.rs b/vortex-geo/src/scalar_fn/distance.rs index feb7ea833aa..97e23c6d721 100644 --- a/vortex-geo/src/scalar_fn/distance.rs +++ b/vortex-geo/src/scalar_fn/distance.rs @@ -10,8 +10,12 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::Constant; use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::ScalarFnArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; @@ -27,6 +31,8 @@ use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_session::VortexSession; +use crate::extension::Point; +use crate::extension::coordinate::coordinate_from_struct; use crate::extension::geometries; use crate::extension::single_geometry; @@ -99,14 +105,25 @@ impl ScalarFnVTable for GeoDistance { (Some(query), None) => distances_to_constant(&b, query.scalar(), ctx), (None, Some(query)) => 
distances_to_constant(&a, query.scalar(), ctx), (None, None) => { - let ag = geometries(&a, ctx)?; - let bg = geometries(&b, ctx)?; vortex_ensure!( - ag.len() == bg.len(), + a.len() == b.len(), "geo distance: operand length mismatch {} vs {}", - ag.len(), - bg.len() + a.len(), + b.len() ); + // Fast path: two Point columns — distance straight over their `x`/`y` f64 buffers. + if is_nonnull_point(a.dtype()) && is_nonnull_point(b.dtype()) { + let (xa, ya) = point_xy(&a, ctx)?; + let (xb, yb) = point_xy(&b, ctx)?; + return Ok(point_distances( + xa.as_slice::().iter().copied(), + ya.as_slice::().iter().copied(), + xb.as_slice::().iter().copied(), + yb.as_slice::().iter().copied(), + )); + } + let ag = geometries(&a, ctx)?; + let bg = geometries(&b, ctx)?; let distances = ag.iter().zip(&bg).map(|(x, y)| Euclidean.distance(x, y)); Ok(PrimitiveArray::from_iter(distances).into_array()) } @@ -121,12 +138,73 @@ fn distances_to_constant( query: &Scalar, ctx: &mut ExecutionCtx, ) -> VortexResult { + // Fast path: Point column vs constant Point — `x`/`y` f64 buffers, broadcasting the constant. + if is_nonnull_point(operand.dtype()) && is_point(query.dtype()) { + let q = coordinate_from_struct(&query.as_extension().to_storage_scalar())?; + let (xs, ys) = point_xy(operand, ctx)?; + return Ok(point_distances( + xs.as_slice::().iter().copied(), + ys.as_slice::().iter().copied(), + std::iter::repeat(q.x), + std::iter::repeat(q.y), + )); + } + let query = single_geometry(query, ctx)?; let geoms = geometries(operand, ctx)?; let distances = geoms.iter().map(|g| Euclidean.distance(g, &query)); Ok(PrimitiveArray::from_iter(distances).into_array()) } +/// Extract the `x` and `y` `f64` columns from a native `Point` operand, for the columnar fast paths. +fn point_xy( + operand: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { + let storage = operand + .clone() + .execute::(ctx)? 
+ .storage_array() + .clone() + .execute::(ctx)?; + let xs = storage + .unmasked_field_by_name("x")? + .clone() + .execute::(ctx)?; + let ys = storage + .unmasked_field_by_name("y")? + .clone() + .execute::(ctx)?; + Ok((xs, ys)) +} + +/// Per-row planar distance `sqrt(dx² + dy²)` over two `(x, y)` f64 streams; a constant side is fed +/// as `repeat(c)`. +fn point_distances( + xa: impl Iterator, + ya: impl Iterator, + xb: impl Iterator, + yb: impl Iterator, +) -> ArrayRef { + let distances = xa.zip(ya).zip(xb.zip(yb)).map(|((xa, ya), (xb, yb))| { + let (dx, dy) = (xa - xb, ya - yb); + (dx * dx + dy * dy).sqrt() + }); + PrimitiveArray::from_iter(distances).into_array() +} + +/// Whether `dtype` is the native `Point` extension (eligible for the columnar fast path). +fn is_point(dtype: &DType) -> bool { + dtype + .as_extension_opt() + .is_some_and(|ext| ext.is::()) +} + +/// A non-nullable native `Point` — a column operand the fast path can read straight from `x`/`y`. +fn is_nonnull_point(dtype: &DType) -> bool { + is_point(dtype) && !dtype.is_nullable() +} + #[cfg(test)] mod tests { use vortex_array::ArrayRef; From 94674e0db6c9cd29ac37ce0e7cb480cb58eaa017 Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Mon, 29 Jun 2026 14:30:58 -0400 Subject: [PATCH 10/10] chore: address PR review comments - Replace non-ASCII characters in comments with ASCII. - Document why catalog registration needs RunFunctionInTransaction. - Reference the FFI function in register_geo_aliases doc. 
Signed-off-by: Nemo Yu --- vortex-bench/src/spatialbench/benchmark.rs | 2 +- vortex-duckdb/cpp/expr.cpp | 2 +- vortex-duckdb/src/convert/expr.rs | 2 +- vortex-duckdb/src/duckdb/database.rs | 4 ++-- vortex-geo/src/extension/mod.rs | 6 ++---- vortex-geo/src/scalar_fn/distance.rs | 8 ++++---- 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vortex-bench/src/spatialbench/benchmark.rs b/vortex-bench/src/spatialbench/benchmark.rs index f06b76579b1..a1be1fedbb9 100644 --- a/vortex-bench/src/spatialbench/benchmark.rs +++ b/vortex-bench/src/spatialbench/benchmark.rs @@ -237,7 +237,7 @@ fn strip_wkb_wrappers(sql: &str) -> String { /// Rewrite `ST_DWithin(..)` calls with a geometry literal operand (`ST_GeomFromText`) to the /// `vortex_dwithin` alias; leave the rest as `ST_DWithin`. `vortex_dwithin` is only correct when it -/// pushes (its bind is cleared), and only single-table filters against a literal push — a join (two +/// pushes (its bind is cleared), and only single-table filters against a literal push - a join (two /// columns) does not, so it must keep `ST_DWithin`. fn route_pushable_dwithin(sql: &str) -> String { const OPEN: &str = "ST_DWithin("; diff --git a/vortex-duckdb/cpp/expr.cpp b/vortex-duckdb/cpp/expr.cpp index 73d2aac1355..210aed50147 100644 --- a/vortex-duckdb/cpp/expr.cpp +++ b/vortex-duckdb/cpp/expr.cpp @@ -58,7 +58,7 @@ extern "C" duckdb_state duckdb_vx_register_geo_aliases(duckdb_database ffi_db) { catalog.CreateFunction(context, info); }); } catch (const std::exception &) { - // No `spatial` loaded → no `ST_DWithin` to alias; nothing to register. + // No `spatial` loaded, so there is no `ST_DWithin` to alias; nothing to register. 
return DuckDBSuccess; } return DuckDBSuccess; diff --git a/vortex-duckdb/src/convert/expr.rs b/vortex-duckdb/src/convert/expr.rs index 1486f15bbd4..42b5d7870cb 100644 --- a/vortex-duckdb/src/convert/expr.rs +++ b/vortex-duckdb/src/convert/expr.rs @@ -137,7 +137,7 @@ fn try_from_geo_function( Ok(Some(expr)) } -/// Geo UDFs that `try_from_geo_function` lowers — shared with `can_push_expression` so the pushable +/// Geo UDFs that `try_from_geo_function` lowers - shared with `can_push_expression` so the pushable /// and lowered sets can't drift. fn is_geo_function(name: &str) -> bool { matches!( diff --git a/vortex-duckdb/src/duckdb/database.rs b/vortex-duckdb/src/duckdb/database.rs index e85d3d4bc5f..e0b357c6b0e 100644 --- a/vortex-duckdb/src/duckdb/database.rs +++ b/vortex-duckdb/src/duckdb/database.rs @@ -91,8 +91,8 @@ impl DatabaseRef { Ok(()) } - /// Register `vortex_dwithin`, a non-throwing alias of the `spatial` extension's `ST_DWithin`, so - /// the radius predicate pushes into the Vortex scan. + /// Register the non-throwing `vortex_dwithin` alias of `ST_DWithin` (via the C + /// `duckdb_vx_register_geo_aliases`) so the radius predicate pushes into the Vortex scan. pub fn register_geo_aliases(&self) -> VortexResult<()> { duckdb_try!( unsafe { cpp::duckdb_vx_register_geo_aliases(self.as_ptr()) }, diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs index fc2faf4c736..3390c4e2e78 100644 --- a/vortex-geo/src/extension/mod.rs +++ b/vortex-geo/src/extension/mod.rs @@ -210,8 +210,7 @@ mod tests { assert_eq!(decoded, meta); } - /// A little-endian WKB `POINT` literal decodes to the native `Point` extension scalar (the Q1 - /// fast-path operand). + /// A little-endian WKB `POINT` literal decodes to the native `Point` extension scalar. 
#[test] fn decodes_wkb_point_to_native() -> VortexResult<()> { let mut wkb = vec![1u8]; // little-endian byte order @@ -227,8 +226,7 @@ mod tests { Ok(()) } - /// A little-endian WKB `POLYGON` literal decodes to the native `Polygon` extension scalar (the Q3 - /// point→polygon operand), proving the literal decode is not point-only. + /// A little-endian WKB `POLYGON` literal decodes to the native `Polygon` extension scalar. #[test] fn decodes_wkb_polygon_to_native() -> VortexResult<()> { let ring = [(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (0.0, 0.0)]; diff --git a/vortex-geo/src/scalar_fn/distance.rs b/vortex-geo/src/scalar_fn/distance.rs index 97e23c6d721..4bbeff94062 100644 --- a/vortex-geo/src/scalar_fn/distance.rs +++ b/vortex-geo/src/scalar_fn/distance.rs @@ -111,7 +111,7 @@ impl ScalarFnVTable for GeoDistance { a.len(), b.len() ); - // Fast path: two Point columns — distance straight over their `x`/`y` f64 buffers. + // Fast path: two Point columns, distance straight over their `x`/`y` f64 buffers. if is_nonnull_point(a.dtype()) && is_nonnull_point(b.dtype()) { let (xa, ya) = point_xy(&a, ctx)?; let (xb, yb) = point_xy(&b, ctx)?; @@ -138,7 +138,7 @@ fn distances_to_constant( query: &Scalar, ctx: &mut ExecutionCtx, ) -> VortexResult { - // Fast path: Point column vs constant Point — `x`/`y` f64 buffers, broadcasting the constant. + // Fast path: Point column vs constant Point, `x`/`y` f64 buffers, broadcasting the constant. if is_nonnull_point(operand.dtype()) && is_point(query.dtype()) { let q = coordinate_from_struct(&query.as_extension().to_storage_scalar())?; let (xs, ys) = point_xy(operand, ctx)?; @@ -178,7 +178,7 @@ fn point_xy( Ok((xs, ys)) } -/// Per-row planar distance `sqrt(dx² + dy²)` over two `(x, y)` f64 streams; a constant side is fed +/// Per-row planar distance `sqrt(dx^2 + dy^2)` over two `(x, y)` f64 streams; a constant side is fed /// as `repeat(c)`. 
fn point_distances( xa: impl Iterator, @@ -200,7 +200,7 @@ fn is_point(dtype: &DType) -> bool { .is_some_and(|ext| ext.is::()) } -/// A non-nullable native `Point` — a column operand the fast path can read straight from `x`/`y`. +/// A non-nullable native `Point`, a column operand the fast path can read straight from `x`/`y`. fn is_nonnull_point(dtype: &DType) -> bool { is_point(dtype) && !dtype.is_nullable() }