Skip to content

Commit 0a53378

Browse files
committed
Move all benchmark data generation to data-gen tool
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent aeae579 commit 0a53378

6 files changed

Lines changed: 101 additions & 75 deletions

File tree

.github/workflows/sql-benchmarks.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ jobs:
574574
env:
575575
RUSTFLAGS: "-C target-cpu=native"
576576
run: |
577+
# vx-bench prepare-data shells out to target/release_debug/data-gen.
577578
packages=(--bin data-gen --bin datafusion-bench --bin duckdb-bench)
578579
if [ "${{ inputs.mode }}" != "pr" ]; then
579580
packages+=(--bin lance-bench)

benchmarks/datafusion-bench/src/main.rs

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,15 @@ use vortex::io::filesystem::FileSystemRef;
3131
use vortex::scan::DataSourceRef;
3232
use vortex_bench::Benchmark;
3333
use vortex_bench::BenchmarkArg;
34-
use vortex_bench::CompactionStrategy;
3534
use vortex_bench::Engine;
3635
use vortex_bench::Format;
3736
use vortex_bench::Opt;
3837
use vortex_bench::Opts;
3938
use vortex_bench::SESSION;
40-
use vortex_bench::conversions::convert_parquet_directory_to_vortex;
4139
use vortex_bench::create_benchmark;
4240
use vortex_bench::create_output_writer;
4341
use vortex_bench::display::DisplayFormat;
42+
use vortex_bench::require_prepared_data;
4443
use vortex_bench::runner::BenchmarkMode;
4544
use vortex_bench::runner::BenchmarkQueryResult;
4645
use vortex_bench::runner::SqlBenchmarkRunner;
@@ -131,29 +130,7 @@ async fn main() -> anyhow::Result<()> {
131130
args.exclude_queries.as_ref(),
132131
);
133132

134-
// Generate Vortex files from Parquet for any Vortex formats requested
135-
if benchmark.data_url().scheme() == "file" {
136-
benchmark.generate_base_data().await?;
137-
138-
let base_path = benchmark
139-
.data_url()
140-
.to_file_path()
141-
.map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?;
142-
143-
for format in args.formats.iter() {
144-
match format {
145-
Format::OnDiskVortex => {
146-
convert_parquet_directory_to_vortex(&base_path, CompactionStrategy::Default)
147-
.await?;
148-
}
149-
Format::VortexCompact => {
150-
convert_parquet_directory_to_vortex(&base_path, CompactionStrategy::Compact)
151-
.await?;
152-
}
153-
_ => {}
154-
}
155-
}
156-
}
133+
require_prepared_data(&*benchmark, &args.formats)?;
157134

158135
let benchmark_name = benchmark.dataset().to_string();
159136

benchmarks/duckdb-bench/src/lib.rs

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,38 @@ impl DuckClient {
5151
format!("{name}/{format}/", name = benchmark.dataset_name()).to_data_path()
5252
};
5353
let dir = base_path.join(format.name());
54-
std::fs::create_dir_all(&dir)?;
5554
let db_path = dir.join("duckdb.db");
5655

56+
if format == Format::OnDiskDuckDB && data_url.scheme() != "file" {
57+
anyhow::bail!("DuckDB format requires local data prepared by data-gen");
58+
}
59+
60+
if format == Format::OnDiskDuckDB {
61+
if !db_path.exists() {
62+
anyhow::bail!(
63+
"prepared DuckDB database is missing at {}. Generate it with \
64+
`vx-bench prepare-data {} --formats-json '[\"duckdb\"]'` or \
65+
`cargo run --bin data-gen -- {} --formats duckdb` using the same --opt values.",
66+
db_path.display(),
67+
benchmark.dataset_name(),
68+
benchmark.dataset_name(),
69+
);
70+
}
71+
} else {
72+
std::fs::create_dir_all(&dir)?;
73+
}
74+
5775
tracing::info!(db_path = %db_path.display(), "Opening DuckDB");
5876

5977
if delete_database && db_path.exists() {
60-
std::fs::remove_file(&db_path)?;
78+
if format == Format::OnDiskDuckDB {
79+
tracing::info!(
80+
db_path = %db_path.display(),
81+
"Keeping prepared DuckDB format database"
82+
);
83+
} else {
84+
std::fs::remove_file(&db_path)?;
85+
}
6186
}
6287

6388
let (db, connection) = Self::open_and_setup_database(Some(db_path.clone()), threads)?;
@@ -147,9 +172,14 @@ impl DuckClient {
147172
benchmark: &B,
148173
file_format: Format,
149174
) -> Result<()> {
175+
if file_format == Format::OnDiskDuckDB {
176+
// Native DuckDB data is materialized by data-gen. The opened database already
177+
// contains benchmark tables, so there is nothing to register here.
178+
return Ok(());
179+
}
180+
150181
let object_type = match file_format {
151182
Format::Parquet | Format::OnDiskVortex | Format::VortexCompact => "VIEW",
152-
Format::OnDiskDuckDB => "TABLE",
153183
Format::Lance => {
154184
anyhow::bail!(
155185
"Lance format is not supported for DuckDB engine. \
@@ -159,11 +189,7 @@ impl DuckClient {
159189
format => anyhow::bail!("Format {format} isn't supported for DuckDB"),
160190
};
161191

162-
// DuckDB loads from parquet for OnDiskDuckDB format
163-
let load_format = match file_format {
164-
Format::Parquet | Format::OnDiskDuckDB => Format::Parquet,
165-
f => f,
166-
};
192+
let load_format = file_format;
167193

168194
// Get the base URL for the format's data directory
169195
let format_url = benchmark.format_path(load_format, benchmark.data_url())?;

benchmarks/duckdb-bench/src/main.rs

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,16 @@ use std::path::PathBuf;
88
use clap::Parser;
99
use clap::value_parser;
1010
use duckdb_bench::DuckClient;
11-
use tokio::runtime::Runtime;
1211
use vortex::metrics::tracing::set_global_labels;
1312
use vortex_bench::BenchmarkArg;
14-
use vortex_bench::CompactionStrategy;
1513
use vortex_bench::Engine;
1614
use vortex_bench::Format;
1715
use vortex_bench::Opt;
1816
use vortex_bench::Opts;
19-
use vortex_bench::conversions::convert_parquet_directory_to_vortex;
2017
use vortex_bench::create_benchmark;
2118
use vortex_bench::create_output_writer;
2219
use vortex_bench::display::DisplayFormat;
20+
use vortex_bench::require_prepared_data;
2321
use vortex_bench::runner::BenchmarkMode;
2422
use vortex_bench::runner::SqlBenchmarkRunner;
2523
use vortex_bench::runner::filter_queries;
@@ -110,43 +108,7 @@ fn main() -> anyhow::Result<()> {
110108
anyhow::bail!("provide a format with --formats");
111109
}
112110

113-
// Generate Vortex files from Parquet for any Vortex formats requested
114-
if benchmark.data_url().scheme() == "file" {
115-
// This is ugly, but otherwise some complicated async interaction might result in a deadlock
116-
let runtime = Runtime::new()?;
117-
118-
runtime.block_on(async {
119-
benchmark.generate_base_data().await?;
120-
121-
let base_path = benchmark
122-
.data_url()
123-
.to_file_path()
124-
.map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?;
125-
126-
for format in args.formats.iter().copied() {
127-
match format {
128-
Format::OnDiskVortex => {
129-
convert_parquet_directory_to_vortex(
130-
&base_path,
131-
CompactionStrategy::Default,
132-
)
133-
.await?;
134-
}
135-
Format::VortexCompact => {
136-
convert_parquet_directory_to_vortex(
137-
&base_path,
138-
CompactionStrategy::Compact,
139-
)
140-
.await?;
141-
}
142-
// OnDiskDuckDB tables are created during register_tables by loading from Parquet
143-
_ => {}
144-
}
145-
}
146-
147-
anyhow::Ok(())
148-
})?;
149-
}
111+
require_prepared_data(&*benchmark, &args.formats)?;
150112

151113
let mut runner = SqlBenchmarkRunner::new(
152114
&*benchmark,

vortex-bench/src/benchmark.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ pub trait Benchmark: Send + Sync {
3636
/// This is the canonical source data that can be converted to other formats.
3737
/// This should be idempotent - safe to call multiple times.
3838
///
39-
/// Format-specific benchmark binaries (like lance-bench, datafusion-bench, duckdb-bench) should
40-
/// call this method to ensure base data exists, then perform their own format conversion.
39+
/// The `data-gen` binary calls this method before creating requested derived formats.
40+
/// Engine-specific benchmark binaries should only run against prepared data.
4141
async fn generate_base_data(&self) -> anyhow::Result<()>;
4242

4343
/// Get expected row counts for validation (optional)

vortex-bench/src/lib.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,66 @@ impl CompactionStrategy {
245245
}
246246
}
247247

248+
/// Verify that local data has already been prepared for the requested benchmark formats.
249+
///
250+
/// Engine-specific benchmark binaries call this before running queries. Data generation itself
251+
/// belongs to the `data-gen` binary.
252+
pub fn require_prepared_data<B>(benchmark: &B, formats: &[Format]) -> anyhow::Result<()>
253+
where
254+
B: Benchmark + ?Sized,
255+
{
256+
if benchmark.data_url().scheme() != "file" {
257+
return Ok(());
258+
}
259+
260+
let base_path = benchmark
261+
.data_url()
262+
.to_file_path()
263+
.map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?;
264+
265+
let mut missing = Vec::new();
266+
for format in formats.iter().copied().unique() {
267+
let required_path = match format {
268+
Format::Arrow | Format::Parquet => base_path.join(Format::Parquet.name()),
269+
Format::OnDiskVortex => base_path.join(Format::OnDiskVortex.name()),
270+
Format::VortexCompact => base_path.join(Format::VortexCompact.name()),
271+
Format::OnDiskDuckDB => base_path
272+
.join(Format::OnDiskDuckDB.name())
273+
.join("duckdb.db"),
274+
format => base_path.join(format.name()),
275+
};
276+
277+
if !required_path.exists() {
278+
missing.push((format, required_path));
279+
}
280+
}
281+
282+
if missing.is_empty() {
283+
return Ok(());
284+
}
285+
286+
let missing_data = missing
287+
.iter()
288+
.map(|(format, path)| format!("{format} ({})", path.display()))
289+
.join(", ");
290+
let requested_formats = formats
291+
.iter()
292+
.copied()
293+
.unique()
294+
.map(|format| format!("\"{format}\""))
295+
.join(",");
296+
297+
anyhow::bail!(
298+
"prepared data is missing for {}: {missing_data}. Generate it first with \
299+
`vx-bench prepare-data {} --formats-json '[{requested_formats}]'` or \
300+
`cargo run --bin data-gen -- {} --formats {}` using the same --opt values.",
301+
benchmark.dataset_display(),
302+
benchmark.dataset_name(),
303+
benchmark.dataset_name(),
304+
formats.iter().copied().unique().join(","),
305+
);
306+
}
307+
248308
/// CLI argument for selecting which benchmark to run.
249309
#[derive(clap::ValueEnum, Clone, Copy)]
250310
pub enum BenchmarkArg {

0 commit comments

Comments
 (0)