diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22834c6..cf25cad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,12 +43,44 @@ jobs:
         run: cargo msrv verify
 
   bench-codspeed:
-    name: Benchmark with Codspeed
+    name: Benchmark with Codspeed (${{ matrix.name }})
     runs-on:
       - runs-on=${{ github.run_id }}
-      - family=c6id.8xlarge
+      - family=c8i.8xlarge
       - image=ubuntu24-full-x64
-      - tag=bench-codspeed
+      - tag=${{ matrix.tag }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Deterministic instruction-count benchmarks under Valgrind, for every suite
+          # EXCEPT bit_transpose (which is measured per-tier in walltime below, so it has
+          # a single home runner). Keep this list in sync with the [[bench]] targets.
+          - name: simulation
+            mode: simulation
+            tag: bench-codspeed
+            rustflags: "-C target-feature=+avx2"
+            build-args: "--bench bitpacking --bench ffor --bench bitpacking_cmp --bench delta --bench transpose --bench rle"
+          # Real wall-clock timings, one runner per Intel feature tier. Each tier is built
+          # with its own -C target-feature flag, and the bit_transpose benches are gated on
+          # that feature via #[bench(<tier>)], so each one compiles — and runs — on exactly
+          # one runner. Measured on runs-on x86 (c8i / Granite Rapids) since CodSpeed's
+          # macro runners are ARM64 and can't execute the x86 paths.
+          - name: walltime-baseline
+            mode: walltime
+            tag: bench-walltime
+            rustflags: ""
+            build-args: "--bench bit_transpose"
+          - name: walltime-bmi2
+            mode: walltime
+            tag: bench-walltime
+            rustflags: "-C target-feature=+bmi2"
+            build-args: "--bench bit_transpose"
+          - name: walltime-avx512
+            mode: walltime
+            tag: bench-walltime
+            rustflags: "-C target-feature=+avx512f,+avx512bw,+avx512vbmi"
+            build-args: "--bench bit_transpose"
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
 
@@ -60,13 +92,13 @@ jobs:
 
       - name: Build benchmarks
         env:
-          RUSTFLAGS: "-C target-feature=+avx2"
+          RUSTFLAGS: ${{ matrix.rustflags }}
         run: |
-          cargo codspeed build --profile bench --features std
+          cargo codspeed build ${{ matrix.build-args }} --profile bench --features std -m ${{ matrix.mode }}
 
       - name: Run benchmarks
         uses: CodSpeedHQ/action@3194d9a39c4d46684cb44bf7207fc56626aad8fd # v4
         with:
-          run: cargo codspeed run
+          run: cargo codspeed run -m ${{ matrix.mode }}
           token: ${{ secrets.CODSPEED_TOKEN }}
-          mode: simulation
+          mode: ${{ matrix.mode }}
diff --git a/Cargo.toml b/Cargo.toml
index 75355db..addd88b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,12 @@ edition = "2021"
 rust-version = "1.91"
 exclude = [".github/*", "renovate.json"]
 
+[workspace]
+# `bench-macros` is a dev-only proc-macro crate (used by `benches/bit_transpose.rs`).
+# Keep it out of the workspace so it stays clear of the published crate, MSRV check,
+# and release tooling; it is still built via the path dev-dependency below.
+exclude = ["bench-macros"]
+
 [features]
 default = []
 # Enables runtime CPU feature detection for `transpose_bits`/`untranspose_bits`
@@ -28,6 +34,7 @@ seq-macro = "0.3.5"
 
 [dev-dependencies]
 arrayref = "0.3.9"
+bench-macros = { path = "bench-macros" }
 divan = { package = "codspeed-divan-compat", version = "4.0" }
 
 [lints.rust]
diff --git a/bench-macros/Cargo.toml b/bench-macros/Cargo.toml
new file mode 100644
index 0000000..61c6e60
--- /dev/null
+++ b/bench-macros/Cargo.toml
@@ -0,0 +1,10 @@
+# Dev-only proc-macro crate for the benchmarks. Excluded from the fastlanes
+# workspace and never published (it's only a `[dev-dependencies]` path dep).
+[package]
+name = "bench-macros"
+version = "0.0.0"
+edition = "2021"
+publish = false
+
+[lib]
+proc-macro = true
diff --git a/bench-macros/src/lib.rs b/bench-macros/src/lib.rs
new file mode 100644
index 0000000..a062ad8
--- /dev/null
+++ b/bench-macros/src/lib.rs
@@ -0,0 +1,38 @@
+//! Dev-only attribute macro for the bit-transpose benchmarks.
+//!
+//! `#[bench(<tier>)]`, placed above `#[divan::bench]`, expands to the
+//! `#[cfg(target_feature = …)]` gate for one Intel feature tier. The gates are
+//! mutually exclusive, so every benchmark is compiled — and therefore run — by
+//! exactly one CI matrix entry, i.e. on exactly one runner.
+
+use proc_macro::TokenStream;
+
+/// Gate a benchmark on its Intel feature tier: `baseline` (no SIMD feature),
+/// `bmi2`, or `avx512` (AVX-512 VBMI).
+///
+/// Any other argument expands to a `compile_error!` naming the rejected input.
+#[proc_macro_attribute]
+pub fn bench(attr: TokenStream, item: TokenStream) -> TokenStream {
+    let gate = match attr.to_string().trim() {
+        "baseline" => r#"#[cfg(not(any(target_feature = "bmi2", target_feature = "avx512vbmi")))]"#,
+        "bmi2" => r#"#[cfg(all(target_feature = "bmi2", not(target_feature = "avx512vbmi")))]"#,
+        "avx512" => r#"#[cfg(target_feature = "avx512vbmi")]"#,
+        other => {
+            // Render the message with `{:?}` so it is emitted as a properly escaped
+            // string literal. Raw interpolation lets quotes or backslashes in the
+            // argument (e.g. `#[bench("bmi2")]`) end the literal early, losing the
+            // intended diagnostic.
+            let msg = format!(
+                "#[bench(..)] expects `baseline`, `bmi2`, or `avx512`, got `{other}`"
+            );
+            return format!("compile_error!({msg:?});")
+                .parse()
+                .expect("compile_error! is valid tokens");
+        }
+    };
+
+    // Prepend the cfg gate; the rest of the attributes (e.g. `#[divan::bench]`)
+    // and the function itself are left untouched.
+    let mut out: TokenStream = gate.parse().expect("cfg attribute is valid tokens");
+    out.extend(item);
+    out
+}
diff --git a/benches/bit_transpose.rs b/benches/bit_transpose.rs
index fcb4e6a..3c073cc 100644
--- a/benches/bit_transpose.rs
+++ b/benches/bit_transpose.rs
@@ -2,9 +2,9 @@ use std::hint::black_box;
 
 use arrayref::array_mut_ref;
 use arrayref::array_ref;
+use bench_macros::bench;
 use divan::counter::BytesCount;
 use divan::Bencher;
-use fastlanes::FastLanes;
 
 fn main() {
     divan::main();
@@ -21,7 +21,7 @@ fn make_input() -> Vec<u64> {
         .collect()
 }
 
-/// Run `op` over every 1024-bit block of the buffer.
+/// Shared driver: run `op` over every 1024-bit block of the buffer.
 fn bench_blocks(bencher: Bencher, op: impl Fn(&[u64; 16], &mut [u64; 16])) {
     let input = make_input();
     let mut output = vec![0u64; U64S];
@@ -36,6 +36,17 @@ fn bench_blocks(bencher: Bencher, op: impl Fn(&[u64; 16], &mut [u64; 16])) {
     });
 }
 
+// `#[bench(<tier>)]` (from the `bench-macros` dev-dependency) gates each benchmark
+// on its Intel feature tier, so it compiles — and therefore runs — only in the CI
+// matrix entry built with that `-C target-feature` flag. The tiers are mutually
+// exclusive, so every benchmark has exactly one home runner:
+//   * baseline (scalar / dispatch) → the `walltime-baseline` runner
+//   * bmi2                         → the `walltime-bmi2` runner
+//   * avx512 vbmi                  → the `walltime-avx512` runner
+// Locally, `cargo bench` builds the baseline tier; pass the matching
+// `RUSTFLAGS="-C target-feature=+…"` (or `-C target-cpu=native`) to run a SIMD tier.
+
+#[bench(baseline)]
 #[divan::bench]
 fn scalar_transpose(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::scalar::transpose_bits);
@@ -44,68 +55,68 @@ fn scalar_transpose(bencher: Bencher) {
 /// Untranspose is generic over the element width `T`; benchmark each width separately. The mask
 /// always factors into 16 groups of 8 bytes regardless of `T`, so per-arch the widths should be
 /// within noise of one another (only the gather/scatter index tables differ).
+#[bench(baseline)]
 #[divan::bench(types = [u8, u16, u32, u64])]
-fn scalar_untranspose<T: FastLanes>(bencher: Bencher) {
+fn scalar_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::scalar::untranspose_bits::<T>);
 }
 
+#[bench(baseline)]
 #[divan::bench]
 fn dispatch_transpose(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::transpose_bits);
 }
 
+#[bench(baseline)]
 #[divan::bench(types = [u8, u16, u32, u64])]
-fn dispatch_untranspose<T: FastLanes>(bencher: Bencher) {
+fn dispatch_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::untranspose_bits::<T>);
 }
 
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(
+    target_arch = "x86_64",
+    any(target_feature = "bmi2", target_feature = "avx512vbmi")
+))]
 mod x86 {
-    use super::{bench_blocks, Bencher};
+    use bench_macros::bench;
     use fastlanes::x86;
 
+    use super::{bench_blocks, Bencher};
+
+    #[bench(bmi2)]
     #[divan::bench]
     fn bmi2_transpose(bencher: Bencher) {
-        if !x86::has_bmi2() {
-            return;
-        }
-        // SAFETY: guarded by `has_bmi2`.
+        // SAFETY: this benchmark only compiles with `+bmi2`.
         bench_blocks(bencher, |i, o| unsafe { x86::transpose_bits_bmi2(i, o) });
     }
 
+    #[bench(bmi2)]
     #[divan::bench]
     fn bmi2_untranspose(bencher: Bencher) {
-        if !x86::has_bmi2() {
-            return;
-        }
-        // SAFETY: guarded by `has_bmi2`.
+        // SAFETY: this benchmark only compiles with `+bmi2`.
         bench_blocks(bencher, |i, o| unsafe { x86::untranspose_bits_bmi2(i, o) });
     }
 
+    #[bench(avx512)]
     #[divan::bench]
     fn vbmi_transpose(bencher: Bencher) {
-        if !x86::has_vbmi() {
-            return;
-        }
-        // SAFETY: guarded by `has_vbmi`.
+        // SAFETY: this benchmark only compiles with `+avx512vbmi`.
         bench_blocks(bencher, |i, o| unsafe { x86::transpose_bits_vbmi(i, o) });
     }
 
+    #[bench(avx512)]
     #[divan::bench]
     fn vbmi_untranspose(bencher: Bencher) {
-        if !x86::has_vbmi() {
-            return;
-        }
-        // SAFETY: guarded by `has_vbmi`.
+        // SAFETY: this benchmark only compiles with `+avx512vbmi`.
         bench_blocks(bencher, |i, o| unsafe { x86::untranspose_bits_vbmi(i, o) });
     }
 }
 
 #[cfg(target_arch = "aarch64")]
 mod aarch64 {
-    use super::{bench_blocks, Bencher};
     use fastlanes::aarch64;
-    use fastlanes::FastLanes;
+
+    use super::{bench_blocks, Bencher};
 
     #[divan::bench]
     fn neon_transpose(bencher: Bencher) {
@@ -116,7 +127,7 @@ mod aarch64 {
     }
 
     #[divan::bench(types = [u8, u16, u32, u64])]
-    fn neon_untranspose<T: FastLanes>(bencher: Bencher) {
+    fn neon_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
         // SAFETY: NEON is always available on aarch64.
         bench_blocks(bencher, |i, o| unsafe {
             aarch64::untranspose_bits_neon::<T>(i, o)