diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22834c6..cf25cad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,12 +43,44 @@ jobs:
         run: cargo msrv verify
 
   bench-codspeed:
-    name: Benchmark with Codspeed
+    name: Benchmark with Codspeed (${{ matrix.name }})
     runs-on:
       - runs-on=${{ github.run_id }}
-      - family=c6id.8xlarge
+      - family=c8i.8xlarge
       - image=ubuntu24-full-x64
-      - tag=bench-codspeed
+      - tag=${{ matrix.tag }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Deterministic instruction-count benchmarks under Valgrind, for every suite
+          # EXCEPT bit_transpose (its benchmarks are measured per-tier in walltime below, so
+          # each one has a single home runner). Keep this list in sync with the [[bench]] targets.
+          - name: simulation
+            mode: simulation
+            tag: bench-codspeed
+            rustflags: "-C target-feature=+avx2"
+            build-args: "--bench bitpacking --bench ffor --bench bitpacking_cmp --bench delta --bench transpose --bench rle"
+          # Real wall-clock timings, one runner per Intel feature tier. Each tier is built
+          # with its own -C target-feature flag, and the bit_transpose benches are gated on
+          # that feature via #[bench(<tier>)], so each one compiles — and runs — on exactly
+          # one runner. Measured on runs-on x86 (c8i / Granite Rapids) since CodSpeed's
+          # macro runners are ARM64 and can't execute the x86 paths.
+          - name: walltime-baseline
+            mode: walltime
+            tag: bench-walltime
+            rustflags: ""
+            build-args: "--bench bit_transpose"
+          - name: walltime-bmi2
+            mode: walltime
+            tag: bench-walltime
+            rustflags: "-C target-feature=+bmi2"
+            build-args: "--bench bit_transpose"
+          - name: walltime-avx512
+            mode: walltime
+            tag: bench-walltime
+            rustflags: "-C target-feature=+avx512f,+avx512bw,+avx512vbmi"
+            build-args: "--bench bit_transpose"
 
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
@@ -60,13 +92,13 @@ jobs:
 
       - name: Build benchmarks
         env:
-          RUSTFLAGS: "-C target-feature=+avx2"
+          RUSTFLAGS: ${{ matrix.rustflags }}
         run: |
-          cargo codspeed build --profile bench --features std
+          cargo codspeed build ${{ matrix.build-args }} --profile bench --features std -m ${{ matrix.mode }}
 
       - name: Run benchmarks
         uses: CodSpeedHQ/action@3194d9a39c4d46684cb44bf7207fc56626aad8fd # v4
         with:
-          run: cargo codspeed run
+          run: cargo codspeed run -m ${{ matrix.mode }}
           token: ${{ secrets.CODSPEED_TOKEN }}
-          mode: simulation
+          mode: ${{ matrix.mode }}
diff --git a/Cargo.toml b/Cargo.toml
index 75355db..addd88b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,12 @@ edition = "2021"
 rust-version = "1.91"
 exclude = [".github/*", "renovate.json"]
 
+[workspace]
+# `bench-macros` is a dev-only proc-macro crate (used by `benches/bit_transpose.rs`).
+# Keep it out of the workspace so it stays clear of the published crate, MSRV check,
+# and release tooling; it is still built via the path dev-dependency below.
+exclude = ["bench-macros"]
+
 [features]
 default = []
 # Enables runtime CPU feature detection for `transpose_bits`/`untranspose_bits`
@@ -28,6 +34,7 @@ seq-macro = "0.3.5"
 
 [dev-dependencies]
 arrayref = "0.3.9"
+bench-macros = { path = "bench-macros" }
 divan = { package = "codspeed-divan-compat", version = "4.0" }
 
 [lints.rust]
diff --git a/bench-macros/Cargo.toml b/bench-macros/Cargo.toml
new file mode 100644
index 0000000..61c6e60
--- /dev/null
+++ b/bench-macros/Cargo.toml
@@ -0,0 +1,10 @@
+# Dev-only proc-macro crate for the benchmarks. Excluded from the fastlanes
+# workspace and never published (it's only a `[dev-dependencies]` path dep).
+[package]
+name = "bench-macros"
+version = "0.0.0"
+edition = "2021"
+publish = false
+
+[lib]
+proc-macro = true
diff --git a/bench-macros/src/lib.rs b/bench-macros/src/lib.rs
new file mode 100644
index 0000000..a062ad8
--- /dev/null
+++ b/bench-macros/src/lib.rs
@@ -0,0 +1,32 @@
+//! Dev-only attribute macro for the bit-transpose benchmarks.
+//!
+//! `#[bench(<tier>)]`, placed above `#[divan::bench]`, expands to the
+//! `#[cfg(target_feature = …)]` gate for one Intel feature tier. The gates are
+//! mutually exclusive, so every benchmark is compiled — and therefore run — by
+//! exactly one CI matrix entry, i.e. on exactly one runner.
+
+use proc_macro::TokenStream;
+
+/// Gate a benchmark on its Intel feature tier: `baseline` (no SIMD feature), `bmi2`, or
+/// `avx512` (AVX-512 VBMI). Any other argument expands to a `compile_error!` instead.
+#[proc_macro_attribute]
+pub fn bench(attr: TokenStream, item: TokenStream) -> TokenStream {
+    let gate = match attr.to_string().trim() {
+        "baseline" => r#"#[cfg(not(any(target_feature = "bmi2", target_feature = "avx512vbmi")))]"#,
+        "bmi2" => r#"#[cfg(all(target_feature = "bmi2", not(target_feature = "avx512vbmi")))]"#,
+        "avx512" => r#"#[cfg(target_feature = "avx512vbmi")]"#,
+        other => {
+            // Debug-format the message so quotes/backslashes in `other` stay escaped.
+            let msg = format!("#[bench(..)] expects `baseline`, `bmi2`, or `avx512`, got `{other}`");
+            return format!("compile_error!({msg:?});")
+                .parse()
+                .expect("compile_error! is valid tokens");
+        }
+    };
+
+    // Prepend the cfg gate; the rest of the attributes (e.g. `#[divan::bench]`)
+    // and the function itself are left untouched.
+    let mut out: TokenStream = gate.parse().expect("cfg attribute is valid tokens");
+    out.extend(item);
+    out
+}
diff --git a/benches/bit_transpose.rs b/benches/bit_transpose.rs
index fcb4e6a..3c073cc 100644
--- a/benches/bit_transpose.rs
+++ b/benches/bit_transpose.rs
@@ -2,9 +2,9 @@ use std::hint::black_box;
 
 use arrayref::array_mut_ref;
 use arrayref::array_ref;
+use bench_macros::bench;
 use divan::counter::BytesCount;
 use divan::Bencher;
-use fastlanes::FastLanes;
 
 fn main() {
     divan::main();
@@ -21,7 +21,7 @@ fn make_input() -> Vec<u64> {
         .collect()
 }
 
-/// Run `op` over every 1024-bit block of the buffer.
+/// Shared driver: run `op` over every 1024-bit block of the buffer.
 fn bench_blocks(bencher: Bencher, op: impl Fn(&[u64; 16], &mut [u64; 16])) {
     let input = make_input();
     let mut output = vec![0u64; U64S];
@@ -36,6 +36,17 @@ fn bench_blocks(bencher: Bencher, op: impl Fn(&[u64; 16], &mut [u64; 16])) {
     });
 }
 
+// `#[bench(<tier>)]` (from the `bench-macros` dev-dependency) gates each benchmark
+// on its Intel feature tier, so it compiles — and therefore runs — only in the CI
+// matrix entry built with that `-C target-feature` flag. The tiers are mutually
+// exclusive, so every benchmark has exactly one home runner:
+//   * baseline (scalar / dispatch) → the `walltime-baseline` runner
+//   * bmi2                         → the `walltime-bmi2` runner
+//   * avx512 vbmi                  → the `walltime-avx512` runner
+// Locally, `cargo bench` builds the baseline tier; pass the matching
+// `RUSTFLAGS="-C target-feature=+…"` (or `-C target-cpu=native`) to run a SIMD tier.
+
+#[bench(baseline)]
 #[divan::bench]
 fn scalar_transpose(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::scalar::transpose_bits);
@@ -44,68 +55,68 @@ fn scalar_transpose(bencher: Bencher) {
 /// Untranspose is generic over the element width `T`; benchmark each width separately. The mask
 /// always factors into 16 groups of 8 bytes regardless of `T`, so per-arch the widths should be
 /// within noise of one another (only the gather/scatter index tables differ).
+#[bench(baseline)]
 #[divan::bench(types = [u8, u16, u32, u64])]
-fn scalar_untranspose<T: FastLanes>(bencher: Bencher) {
+fn scalar_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::scalar::untranspose_bits::<T>);
 }
 
+#[bench(baseline)]
 #[divan::bench]
 fn dispatch_transpose(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::transpose_bits);
 }
 
+#[bench(baseline)]
 #[divan::bench(types = [u8, u16, u32, u64])]
-fn dispatch_untranspose<T: FastLanes>(bencher: Bencher) {
+fn dispatch_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
     bench_blocks(bencher, fastlanes::untranspose_bits::<T>);
 }
 
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(
+    target_arch = "x86_64",
+    any(target_feature = "bmi2", target_feature = "avx512vbmi")
+))]
 mod x86 {
-    use super::{bench_blocks, Bencher};
+    use bench_macros::bench;
     use fastlanes::x86;
 
+    use super::{bench_blocks, Bencher};
+
+    #[bench(bmi2)]
     #[divan::bench]
     fn bmi2_transpose(bencher: Bencher) {
-        if !x86::has_bmi2() {
-            return;
-        }
-        // SAFETY: guarded by `has_bmi2`.
+        // SAFETY: this benchmark only compiles with `+bmi2`.
         bench_blocks(bencher, |i, o| unsafe { x86::transpose_bits_bmi2(i, o) });
     }
 
+    #[bench(bmi2)]
     #[divan::bench]
     fn bmi2_untranspose(bencher: Bencher) {
-        if !x86::has_bmi2() {
-            return;
-        }
-        // SAFETY: guarded by `has_bmi2`.
+        // SAFETY: this benchmark only compiles with `+bmi2`.
         bench_blocks(bencher, |i, o| unsafe { x86::untranspose_bits_bmi2(i, o) });
     }
 
+    #[bench(avx512)]
     #[divan::bench]
     fn vbmi_transpose(bencher: Bencher) {
-        if !x86::has_vbmi() {
-            return;
-        }
-        // SAFETY: guarded by `has_vbmi`.
+        // SAFETY: this benchmark only compiles with `+avx512vbmi`.
         bench_blocks(bencher, |i, o| unsafe { x86::transpose_bits_vbmi(i, o) });
     }
 
+    #[bench(avx512)]
     #[divan::bench]
     fn vbmi_untranspose(bencher: Bencher) {
-        if !x86::has_vbmi() {
-            return;
-        }
-        // SAFETY: guarded by `has_vbmi`.
+        // SAFETY: this benchmark only compiles with `+avx512vbmi`.
         bench_blocks(bencher, |i, o| unsafe { x86::untranspose_bits_vbmi(i, o) });
     }
 }
 
 #[cfg(target_arch = "aarch64")]
 mod aarch64 {
-    use super::{bench_blocks, Bencher};
     use fastlanes::aarch64;
-    use fastlanes::FastLanes;
+
+    use super::{bench_blocks, Bencher};
 
     #[divan::bench]
     fn neon_transpose(bencher: Bencher) {
@@ -116,7 +127,7 @@ mod aarch64 {
     }
 
     #[divan::bench(types = [u8, u16, u32, u64])]
-    fn neon_untranspose<T: FastLanes>(bencher: Bencher) {
+    fn neon_untranspose<T: fastlanes::FastLanes>(bencher: Bencher) {
         // SAFETY: NEON is always available on aarch64.
         bench_blocks(bencher, |i, o| unsafe {
             aarch64::untranspose_bits_neon::<T>(i, o)