Skip to content

Commit b7b1408

Browse files
feat(vortex-bench): support vortex native geo types into SpatialBench, and wire into benchmark orchestrator (#8623)
## Summary <!-- Why are you proposing this change, and what is its impact? Is it part of a long term effort, or a bigger change? If this PR is related to a tracked effort or an open issue, please link to the relevant issue. --> Wires the Vortex native geo format into SpatialBench and into the `vx-bench` / `bench-orchestrator` pipeline, so it can be run end-to-end like the other benchmarks. Running command: ``` vx-bench run spatialbench -e duckdb -f parquet,vortex,vortex-geo-native --opt scale-factor=1.0 --queries 1,2,3,4,5,6,7,8,9 ``` ## Limitation <!-- No need to duplicate information from the previous section, but if you're touching many parts of the code base, it's worth explicitly noting the important changes or how they are tested. --> - DuckDB-only, as before. For now, SpatialBench queries use DuckDB-specific ST_* spatial SQL that DataFusion has no functions for yet. There is a single ad-hoc entry in `BENCHMARK_ENGINES = { SPATIALBENCH: {DUCKDB} }`. - Queries 10, 11, and 12 time out because DuckDB has poor support for spatial indexes. 
## Performance SF=1.0 ``` ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Query ┃ duckdb:parquet (base) ┃ duckdb:vortex ┃ duckdb:vortex-native ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩ │ 1 │ 40.8ms │ 13.8ms (0.34x) │ 23.2ms (0.57x) │ │ 2 │ 181.2ms │ 26.3ms (0.14x) │ 35.3ms (0.20x) │ │ 3 │ 57.8ms │ 19.8ms (0.34x) │ 30.9ms (0.53x) │ │ 4 │ 331.9ms │ 61.8ms (0.19x) │ 111.0ms (0.33x) │ │ 5 │ 356.1ms │ 294.3ms (0.83x) │ 299.3ms (0.84x) │ │ 6 │ 441.9ms │ 86.3ms (0.20x) │ 135.1ms (0.31x) │ │ 7 │ 157.2ms │ 73.7ms (0.47x) │ 93.1ms (0.59x) │ │ 8 │ 197.8ms │ 78.1ms (0.39x) │ 91.9ms (0.46x) │ │ 9 │ 20.3ms │ 18.6ms (0.91x) │ 20.7ms (1.02x) │ └───────┴───────────────────────┴─────────────────┴──────────────────────┘ ``` SF=3 ``` ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Query ┃ duckdb:parquet (base) ┃ duckdb:vortex ┃ duckdb:vortex-native ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩ │ 1 │ 51.5ms │ 26.6ms (0.52x) │ 61.6ms (1.20x) │ │ 2 │ 127.6ms │ 56.2ms (0.44x) │ 93.1ms (0.73x) │ │ 3 │ 69.1ms │ 46.3ms (0.67x) │ 80.0ms (1.16x) │ │ 4 │ 543.4ms │ 64.5ms (0.12x) │ 119.6ms (0.22x) │ │ 5 │ 980.6ms │ 881.6ms (0.90x) │ 894.4ms (0.91x) │ │ 6 │ 660.9ms │ 133.8ms (0.20x) │ 233.2ms (0.35x) │ │ 7 │ 255.7ms │ 240.6ms (0.94x) │ 264.9ms (1.04x) │ │ 8 │ 267.6ms │ 247.4ms (0.92x) │ 299.7ms (1.12x) │ │ 9 │ 30.1ms │ 28.8ms (0.96x) │ 30.6ms (1.02x) │ └───────┴───────────────────────┴─────────────────┴──────────────────────┘ ``` SF=10 ``` ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Query ┃ duckdb:parquet (base) ┃ duckdb:vortex ┃ duckdb:vortex-native ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩ │ 1 │ 160.4ms │ 128.5ms (0.80x) │ 214.8ms (1.34x) │ │ 2 │ 254.3ms │ 239.8ms (0.94x) │ 325.9ms (1.28x) │ │ 3 │ 231.0ms │ 198.7ms (0.86x) │ 284.4ms (1.23x) │ │ 4 │ 188.0ms │ 124.2ms (0.66x) │ 147.9ms (0.79x) │ │ 5 │ 3.14s 
│ 3.05s (0.97x) │ 2.92s (0.93x) │ │ 6 │ 480.5ms │ 361.7ms (0.75x) │ 467.7ms (0.97x) │ │ 7 │ 992.8ms │ 1.02s (1.02x) │ 915.2ms (0.92x) │ │ 8 │ 1.07s │ 961.8ms (0.90x) │ 1.02s (0.95x) │ │ 9 │ 34.2ms │ 34.1ms (1.00x) │ 43.5ms (1.27x) │ └───────┴───────────────────────┴─────────────────┴──────────────────────┘ ``` <!-- Are there any user-facing changes that might require documentation updates Is any public API changed? --> Takeaways: for now we do not yet have predicate pushdown that executes predicates in the Vortex kernel, so native geo types pay the cost of converting to geo-binary (to talk to DuckDB) without any gains; as a result, at SF=10 vortex-native is less efficient than vortex-wkb. --------- Signed-off-by: Nemo Yu <zyu379@wisc.edu>
1 parent 574326f commit b7b1408

18 files changed

Lines changed: 400 additions & 32 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bench-orchestrator/bench_orchestrator/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class Format(Enum):
3535
PARQUET = "parquet"
3636
VORTEX = "vortex"
3737
VORTEX_COMPACT = "vortex-compact"
38+
VORTEX_NATIVE = "vortex-geo-native"
3839
DUCKDB = "duckdb"
3940
LANCE = "lance"
4041

@@ -68,6 +69,7 @@ class Benchmark(Enum):
6869
Format.PARQUET,
6970
Format.VORTEX,
7071
Format.VORTEX_COMPACT,
72+
Format.VORTEX_NATIVE,
7173
Format.DUCKDB,
7274
],
7375
Engine.LANCE: [Format.LANCE],

bench-orchestrator/tests/test_config.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,23 @@ def test_parse_formats_json_accepts_ci_format_arrays() -> None:
2626
assert formats == [Format.PARQUET, Format.VORTEX, Format.DUCKDB]
2727

2828

29+
def test_parse_formats_json_accepts_vortex_native() -> None:
30+
formats = parse_formats_json('["parquet","vortex","vortex-geo-native"]')
31+
32+
assert formats == [Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE]
33+
34+
35+
def test_resolve_axis_targets_offers_vortex_native_on_duckdb_only() -> None:
36+
# vortex-geo-native is a DuckDB-only lane; the DataFusion axis is dropped as unsupported.
37+
targets, warnings = resolve_axis_targets(
38+
[Engine.DATAFUSION, Engine.DUCKDB],
39+
[Format.VORTEX_NATIVE],
40+
)
41+
42+
assert targets == [BenchmarkTarget(engine=Engine.DUCKDB, format=Format.VORTEX_NATIVE)]
43+
assert warnings == ["Format vortex-geo-native is not supported by engine datafusion"]
44+
45+
2946
def test_resolve_axis_targets_filters_unsupported_combinations() -> None:
3047
targets, warnings = resolve_axis_targets(
3148
[Engine.DATAFUSION, Engine.DUCKDB],
@@ -55,6 +72,23 @@ def test_resolve_axis_targets_skips_engines_a_benchmark_cannot_run() -> None:
5572
assert warnings == ["Benchmark spatialbench does not support engine datafusion"]
5673

5774

75+
def test_resolve_axis_targets_expands_spatialbench_three_lanes() -> None:
76+
# The single-command three-lane comparison: parquet, WKB vortex, and native-geometry vortex, all
77+
# on DuckDB.
78+
targets, warnings = resolve_axis_targets(
79+
[Engine.DUCKDB],
80+
[Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE],
81+
Benchmark.SPATIALBENCH,
82+
)
83+
84+
assert targets == [
85+
BenchmarkTarget(engine=Engine.DUCKDB, format=Format.PARQUET),
86+
BenchmarkTarget(engine=Engine.DUCKDB, format=Format.VORTEX),
87+
BenchmarkTarget(engine=Engine.DUCKDB, format=Format.VORTEX_NATIVE),
88+
]
89+
assert warnings == []
90+
91+
5892
def test_validate_targets_rejects_engine_a_benchmark_cannot_run() -> None:
5993
errors = validate_targets(
6094
[BenchmarkTarget(engine=Engine.DATAFUSION, format=Format.PARQUET)],

bench-orchestrator/tests/test_executor.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,19 @@ def test_build_command_adds_duckdb_cleanup_flag() -> None:
3333
assert "scale-factor=1.0" in cmd
3434

3535

36+
def test_build_command_serializes_vortex_native_format() -> None:
37+
executor = BenchmarkExecutor(Path("/tmp/duckdb-bench"), Engine.DUCKDB)
38+
39+
cmd = executor.build_command(
40+
benchmark=Benchmark.SPATIALBENCH,
41+
formats=[Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE],
42+
iterations=1,
43+
options={"scale-factor": "1.0"},
44+
)
45+
46+
assert "parquet,vortex,vortex-geo-native" in cmd
47+
48+
3649
def test_build_command_omits_formats_for_lance_backend() -> None:
3750
executor = BenchmarkExecutor(Path("/tmp/lance-bench"), Engine.LANCE)
3851

benchmarks/datafusion-bench/src/lib.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,9 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {
111111
Format::Csv => Arc::new(CsvFormat::default()) as _,
112112
Format::Arrow => Arc::new(ArrowFormat),
113113
Format::Parquet => Arc::new(ParquetFormat::new()),
114-
Format::OnDiskVortex | Format::VortexCompact => Arc::new(VortexFormat::new_with_options(
115-
SESSION.clone(),
116-
vortex_table_options(),
117-
)),
114+
Format::OnDiskVortex | Format::VortexCompact | Format::VortexNative => Arc::new(
115+
VortexFormat::new_with_options(SESSION.clone(), vortex_table_options()),
116+
),
118117
Format::OnDiskDuckDB | Format::Lance => {
119118
unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`")
120119
}

benchmarks/duckdb-bench/src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,10 @@ impl DuckClient {
169169
file_format: Format,
170170
) -> Result<()> {
171171
let object_type = match file_format {
172-
Format::Parquet | Format::OnDiskVortex | Format::VortexCompact => "VIEW",
172+
Format::Parquet
173+
| Format::OnDiskVortex
174+
| Format::VortexCompact
175+
| Format::VortexNative => "VIEW",
173176
Format::OnDiskDuckDB => "TABLE",
174177
Format::Lance => {
175178
anyhow::bail!(

benchmarks/duckdb-bench/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ fn main() -> anyhow::Result<()> {
142142
// OnDiskDuckDB tables are created during register_tables by loading from Parquet
143143
_ => {}
144144
}
145+
benchmark.prepare_format(format, &base_path).await?;
145146
}
146147

147148
anyhow::Ok(())

vortex-bench/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ async-trait = { workspace = true }
3434
bzip2 = { workspace = true }
3535
clap = { workspace = true, features = ["derive"] }
3636
futures = { workspace = true }
37+
geoarrow = { workspace = true }
38+
geoarrow-cast = { workspace = true }
3739
get_dir = { workspace = true }
3840
glob = { workspace = true }
3941
humansize = { workspace = true }

vortex-bench/src/benchmark.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
//! Core benchmark trait and types.
55
6+
use std::path::Path;
7+
68
use arrow_schema::Schema;
79
use glob::Pattern;
810
use url::Url;
@@ -47,6 +49,13 @@ pub trait Benchmark: Send + Sync {
4749
/// call this method to ensure base data exists, then perform their own format conversion.
4850
async fn generate_base_data(&self) -> anyhow::Result<()>;
4951

52+
/// Prepare benchmark- and format-specific data beyond the Parquet base that
53+
/// [`Benchmark::generate_base_data`] produced. Called once per requested format, after the base
54+
/// data exists. Default: nothing.
55+
async fn prepare_format(&self, _format: Format, _base_path: &Path) -> anyhow::Result<()> {
56+
Ok(())
57+
}
58+
5059
/// Get expected row counts for validation (optional)
5160
/// If None, no validation will be performed
5261
fn expected_row_counts(&self) -> Option<Vec<usize>> {

vortex-bench/src/lib.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,11 @@ use vortex::session::VortexSession;
7676
#[global_allocator]
7777
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
7878

79-
pub static SESSION: LazyLock<VortexSession> =
80-
LazyLock::new(|| VortexSession::default().with_tokio());
79+
pub static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
80+
let session = VortexSession::default().with_tokio();
81+
vortex_geo::initialize(&session);
82+
session
83+
});
8184

8285
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
8386
pub struct Target {
@@ -146,6 +149,9 @@ pub enum Format {
146149
#[clap(name = "vortex-compact")]
147150
#[serde(rename = "vortex-compact")]
148151
VortexCompact,
152+
#[clap(name = "vortex-geo-native")]
153+
#[serde(rename = "vortex-geo-native")]
154+
VortexNative,
149155
#[clap(name = "duckdb")]
150156
#[serde(rename = "duckdb")]
151157
OnDiskDuckDB,
@@ -185,6 +191,7 @@ impl Format {
185191
Format::Parquet => "parquet",
186192
Format::OnDiskVortex => "vortex-file-compressed",
187193
Format::VortexCompact => "vortex-compact",
194+
Format::VortexNative => "vortex-geo-native",
188195
Format::OnDiskDuckDB => "duckdb",
189196
Format::Lance => "lance",
190197
}
@@ -197,6 +204,7 @@ impl Format {
197204
Format::Parquet => "parquet",
198205
Format::OnDiskVortex => "vortex",
199206
Format::VortexCompact => "vortex",
207+
Format::VortexNative => "vortex",
200208
Format::OnDiskDuckDB => "duckdb",
201209
Format::Lance => "lance",
202210
}

0 commit comments

Comments
 (0)