Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 39 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,44 @@ jobs:
run: cargo msrv verify

bench-codspeed:
name: Benchmark with Codspeed
name: Benchmark with Codspeed (${{ matrix.name }})
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
- family=c8i.8xlarge
- image=ubuntu24-full-x64
- tag=bench-codspeed
- tag=${{ matrix.tag }}
strategy:
fail-fast: false
matrix:
include:
# Deterministic instruction-count benchmarks under Valgrind, for every suite
# EXCEPT bit_transpose. That suite is measured per-tier in walltime below, so each of
# its benchmarks has exactly one home runner. Keep this list in sync with the [[bench]] targets.
- name: simulation
mode: simulation
tag: bench-codspeed
rustflags: "-C target-feature=+avx2"
build-args: "--bench bitpacking --bench ffor --bench bitpacking_cmp --bench delta --bench transpose --bench rle"
# Real wall-clock timings, one runner per Intel feature tier. Each tier is built
# with its own -C target-feature flag, and the bit_transpose benches are gated on
# that feature via #[bench(<tier>)], so each one compiles — and runs — on exactly
# one runner. Measured on runs-on x86 (c8i / Granite Rapids) since CodSpeed's
# macro runners are ARM64 and can't execute the x86 paths.
- name: walltime-baseline
mode: walltime
tag: bench-walltime
rustflags: ""
build-args: "--bench bit_transpose"
- name: walltime-bmi2
mode: walltime
tag: bench-walltime
rustflags: "-C target-feature=+bmi2"
build-args: "--bench bit_transpose"
- name: walltime-avx512
mode: walltime
tag: bench-walltime
rustflags: "-C target-feature=+avx512f,+avx512bw,+avx512vbmi"
build-args: "--bench bit_transpose"

steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
Expand All @@ -60,13 +92,13 @@ jobs:

- name: Build benchmarks
env:
RUSTFLAGS: "-C target-feature=+avx2"
RUSTFLAGS: ${{ matrix.rustflags }}
run: |
cargo codspeed build --profile bench --features std
cargo codspeed build ${{ matrix.build-args }} --profile bench --features std -m ${{ matrix.mode }}

- name: Run benchmarks
uses: CodSpeedHQ/action@3194d9a39c4d46684cb44bf7207fc56626aad8fd # v4
with:
run: cargo codspeed run
run: cargo codspeed run -m ${{ matrix.mode }}
token: ${{ secrets.CODSPEED_TOKEN }}
mode: simulation
mode: ${{ matrix.mode }}
7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ edition = "2021"
rust-version = "1.91"
exclude = [".github/*", "renovate.json"]

[workspace]
# `bench-macros` is a dev-only proc-macro crate (used by `benches/bit_transpose.rs`).
# Keep it out of the workspace so it stays clear of the published crate, MSRV check,
# and release tooling; it is still built via the path dev-dependency below.
exclude = ["bench-macros"]

[features]
default = []
# Enables runtime CPU feature detection for `transpose_bits`/`untranspose_bits`
Expand All @@ -28,6 +34,7 @@ seq-macro = "0.3.5"

[dev-dependencies]
arrayref = "0.3.9"
bench-macros = { path = "bench-macros" }
divan = { package = "codspeed-divan-compat", version = "4.0" }

[lints.rust]
Expand Down
10 changes: 10 additions & 0 deletions bench-macros/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Dev-only proc-macro crate for the benchmarks. Excluded from the fastlanes
# workspace and never published (it's only a `[dev-dependencies]` path dep).
[package]
name = "bench-macros"
version = "0.0.0"
edition = "2021"
publish = false

[lib]
proc-macro = true
32 changes: 32 additions & 0 deletions bench-macros/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//! Dev-only attribute macro for the bit-transpose benchmarks.
//!
//! `#[bench(<tier>)]`, placed above `#[divan::bench]`, expands to the
//! `#[cfg(target_feature = …)]` gate for one Intel feature tier. The gates are
//! mutually exclusive, so every benchmark is compiled — and therefore run — by
//! exactly one CI matrix entry, i.e. on exactly one runner.

use proc_macro::TokenStream;

/// Gate a benchmark on one Intel feature tier.
///
/// `attr` is the tier name written inside `#[bench(..)]`; `item` is the annotated
/// function together with its remaining attributes (e.g. `#[divan::bench]`).
///
/// Recognised tiers and the `#[cfg]` gate each one prepends to `item`:
///
/// * `baseline` — neither `bmi2` nor `avx512vbmi` is enabled.
/// * `bmi2` — `bmi2` is enabled and `avx512vbmi` is not.
/// * `avx512` — `avx512vbmi` is enabled.
///
/// Exactly one of these predicates holds for any given build, so a gated item is
/// compiled under exactly one tier.
///
/// Any other tier name expands to a `compile_error!` that quotes the rejected
/// input; the annotated item is dropped in that case.
#[proc_macro_attribute]
pub fn bench(attr: TokenStream, item: TokenStream) -> TokenStream {
    let tier = attr.to_string();
    let gate = match tier.trim() {
        "baseline" => r#"#[cfg(not(any(target_feature = "bmi2", target_feature = "avx512vbmi")))]"#,
        "bmi2" => r#"#[cfg(all(target_feature = "bmi2", not(target_feature = "avx512vbmi")))]"#,
        "avx512" => r#"#[cfg(target_feature = "avx512vbmi")]"#,
        other => {
            // `other` is arbitrary user-written tokens. Escape it before splicing it
            // into the string literal so quotes or backslashes in the input cannot
            // terminate the literal early and turn the diagnostic into a parse panic.
            return format!(
                "compile_error!(\"#[bench(..)] expects `baseline`, `bmi2`, or `avx512`, got `{}`\");",
                other.escape_default()
            )
            .parse()
            .expect("compile_error! is valid tokens");
        }
    };

    // Prepend the cfg gate; the rest of the attributes (e.g. `#[divan::bench]`)
    // and the function itself are left untouched.
    let mut out: TokenStream = gate.parse().expect("cfg attribute is valid tokens");
    out.extend(item);
    out
}
61 changes: 36 additions & 25 deletions benches/bit_transpose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ use std::hint::black_box;

use arrayref::array_mut_ref;
use arrayref::array_ref;
use bench_macros::bench;
use divan::counter::BytesCount;
use divan::Bencher;
use fastlanes::FastLanes;

fn main() {
divan::main();
Expand All @@ -21,7 +21,7 @@ fn make_input() -> Vec<u64> {
.collect()
}

/// Run `op` over every 1024-bit block of the buffer.
/// Shared driver: run `op` over every 1024-bit block of the buffer.
fn bench_blocks(bencher: Bencher, op: impl Fn(&[u64; 16], &mut [u64; 16])) {
let input = make_input();
let mut output = vec![0u64; U64S];
Expand All @@ -36,6 +36,17 @@ fn bench_blocks(bencher: Bencher, op: impl Fn(&[u64; 16], &mut [u64; 16])) {
});
}

// `#[bench(<tier>)]` (from the `bench-macros` dev-dependency) gates each benchmark
// on its Intel feature tier, so it compiles — and therefore runs — only in the CI
// matrix entry built with that `-C target-feature` flag. The tiers are mutually
// exclusive, so every benchmark has exactly one home runner:
// * baseline (scalar / dispatch) → the `walltime-baseline` runner
// * bmi2 → the `walltime-bmi2` runner
// * avx512 vbmi → the `walltime-avx512` runner
// Locally, `cargo bench` builds the baseline tier; pass the matching
// `RUSTFLAGS="-C target-feature=+…"` (or `-C target-cpu=native`) to run a SIMD tier.

#[bench(baseline)]
#[divan::bench]
fn scalar_transpose(bencher: Bencher) {
bench_blocks(bencher, fastlanes::scalar::transpose_bits);
Expand All @@ -44,68 +55,68 @@ fn scalar_transpose(bencher: Bencher) {
/// Untranspose is generic over the element width `T`; benchmark each width separately. The mask
/// always factors into 16 groups of 8 bytes regardless of `T`, so per-arch the widths should be
/// within noise of one another (only the gather/scatter index tables differ).
#[bench(baseline)]
#[divan::bench(types = [u8, u16, u32, u64])]
fn scalar_untranspose<T: FastLanes>(bencher: Bencher) {
fn scalar_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
bench_blocks(bencher, fastlanes::scalar::untranspose_bits::<T>);
}

/// Benchmark the crate-level `fastlanes::transpose_bits` entry point.
///
/// Belongs to the `baseline` tier, so it is compiled only when neither `bmi2`
/// nor `avx512vbmi` is enabled as a target feature.
#[bench(baseline)]
#[divan::bench]
fn dispatch_transpose(bencher: Bencher) {
    let op = fastlanes::transpose_bits;
    bench_blocks(bencher, op);
}

#[bench(baseline)]
#[divan::bench(types = [u8, u16, u32, u64])]
fn dispatch_untranspose<T: FastLanes>(bencher: Bencher) {
fn dispatch_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
bench_blocks(bencher, fastlanes::untranspose_bits::<T>);
}

#[cfg(target_arch = "x86_64")]
#[cfg(all(
target_arch = "x86_64",
any(target_feature = "bmi2", target_feature = "avx512vbmi")
))]
mod x86 {
use super::{bench_blocks, Bencher};
use bench_macros::bench;
use fastlanes::x86;

use super::{bench_blocks, Bencher};

#[bench(bmi2)]
#[divan::bench]
fn bmi2_transpose(bencher: Bencher) {
if !x86::has_bmi2() {
return;
}
// SAFETY: guarded by `has_bmi2`.
// SAFETY: this benchmark only compiles with `+bmi2`.
bench_blocks(bencher, |i, o| unsafe { x86::transpose_bits_bmi2(i, o) });
}

#[bench(bmi2)]
#[divan::bench]
fn bmi2_untranspose(bencher: Bencher) {
if !x86::has_bmi2() {
return;
}
// SAFETY: guarded by `has_bmi2`.
// SAFETY: this benchmark only compiles with `+bmi2`.
bench_blocks(bencher, |i, o| unsafe { x86::untranspose_bits_bmi2(i, o) });
}

#[bench(avx512)]
#[divan::bench]
fn vbmi_transpose(bencher: Bencher) {
if !x86::has_vbmi() {
return;
}
// SAFETY: guarded by `has_vbmi`.
// SAFETY: this benchmark only compiles with `+avx512vbmi`.
bench_blocks(bencher, |i, o| unsafe { x86::transpose_bits_vbmi(i, o) });
}

#[bench(avx512)]
#[divan::bench]
fn vbmi_untranspose(bencher: Bencher) {
if !x86::has_vbmi() {
return;
}
// SAFETY: guarded by `has_vbmi`.
// SAFETY: this benchmark only compiles with `+avx512vbmi`.
bench_blocks(bencher, |i, o| unsafe { x86::untranspose_bits_vbmi(i, o) });
}
}

#[cfg(target_arch = "aarch64")]
mod aarch64 {
use super::{bench_blocks, Bencher};
use fastlanes::aarch64;
use fastlanes::FastLanes;

use super::{bench_blocks, Bencher};

#[divan::bench]
fn neon_transpose(bencher: Bencher) {
Expand All @@ -116,7 +127,7 @@ mod aarch64 {
}

#[divan::bench(types = [u8, u16, u32, u64])]
fn neon_untranspose<T: FastLanes>(bencher: Bencher) {
fn neon_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
// SAFETY: NEON is always available on aarch64.
bench_blocks(bencher, |i, o| unsafe {
aarch64::untranspose_bits_neon::<T>(i, o)
Expand Down
Loading