vortex-data · joseph-isaacs · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -227,6 +227,7 @@ roaring = "0.11.0"
 rstest = "0.26.1"
 rstest_reuse = "0.7.0"
 rustc-hash = "2.1"
+seq-macro = "0.3.6"
 serde = "1.0.220"
 serde_json = "1.0.138"
 serde_test = "1.0.176"

diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml
@@ -21,8 +21,10 @@ fastlanes = { workspace = true }
 itertools = { workspace = true }
 lending-iterator = { workspace = true }
 num-traits = { workspace = true }
+paste = { workspace = true }
 prost = { workspace = true }
 rand = { workspace = true, optional = true }
+seq-macro = { workspace = true }
 vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
@@ -60,6 +62,10 @@ name = "bit_transpose"
 harness = false
 required-features = ["_test-harness"]
 
+[[bench]]
+name = "bitpack_constant"
+harness = false
+
 [[bench]]
 name = "bitpack_compare"
 harness = false

diff --git a/encodings/fastlanes/benches/bitpack_compare.rs b/encodings/fastlanes/benches/bitpack_compare.rs
@@ -51,6 +51,53 @@ fn build_inputs<const BW: u8>(len: usize) -> (ArrayRef, ArrayRef, ExecutionCtx)
     (array, rhs, ctx)
 }
 
+/// Inputs for the in-range benchmarks: a bit-packed `u32` array whose values cycle
+/// through `0..2^BW`, and a constant RHS of `2^BW / 2`, which lies inside that range
+/// (so a shortcut for out-of-range constants should not apply).
+fn build_in_range_inputs<const BW: u8>(len: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // Number of distinct values representable in `BW` bits.
+    let modulus = 1u32 << BW;
+    let values: BufferMut<u32> = (0..len).map(|idx| (idx as u32) % modulus).collect();
+    let unpacked = PrimitiveArray::new(values.freeze(), Validity::NonNullable).into_array();
+    let packed = BitPackedData::encode(&unpacked, BW, &mut ctx).unwrap().into_array();
+    // Midpoint of the packed range, so the RHS is a value the array can actually
+    // contain rather than one that is trivially out of range.
+    let midpoint = modulus / 2;
+    let constant_rhs = ConstantArray::new(midpoint, len).into_array();
+
+    (packed, constant_rhs, ctx)
+}
+
+/// Benchmarks `array == constant` evaluated directly on the bit-packed array.
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn in_range_eq<const BW: u8>(bencher: Bencher, len: usize) {
+    let (packed, constant_rhs, mut ctx) = build_in_range_inputs::<BW>(len);
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        let comparison = packed
+            .clone()
+            .binary(constant_rhs.clone(), Operator::Eq)
+            .unwrap();
+        comparison.execute::<BoolArray>(&mut ctx).unwrap()
+    });
+}
+
+/// Baseline for `in_range_eq`: decode to a `PrimitiveArray` first, then compare that.
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn in_range_eq_baseline<const BW: u8>(bencher: Bencher, len: usize) {
+    let (packed, constant_rhs, mut ctx) = build_in_range_inputs::<BW>(len);
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        // Step 1: materialize the unpacked values.
+        let decoded = packed.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+        // Step 2: run the equality comparison on the decoded array.
+        let comparison = decoded
+            .into_array()
+            .binary(constant_rhs.clone(), Operator::Eq)
+            .unwrap();
+        comparison.execute::<BoolArray>(&mut ctx).unwrap()
+    });
+}
+
 #[divan::bench(args = LENS, consts = BIT_WIDTHS)]
 fn fast_eq_out_of_range<const BW: u8>(bencher: Bencher, len: usize) {
     let (array, rhs, mut ctx) = build_inputs::<BW>(len);

diff --git a/encodings/fastlanes/benches/bitpack_constant.rs b/encodings/fastlanes/benches/bitpack_constant.rs
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Compare the fast constant bit-packing path against the standard `bitpack_encode`
+//! pipeline on a uniform-constant input.
+//!
+//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_constant`.
+
+#![expect(clippy::unwrap_used)]
+
+use divan::Bencher;
+use divan::black_box;
+use divan::counter::ItemsCount;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::validity::Validity;
+use vortex_buffer::BufferMut;
+use vortex_fastlanes::bitpack_compress::bitpack_encode;
+use vortex_fastlanes::bitpack_compress::bitpack_encode_constant;
+
+fn main() {
+    divan::main(); // Hand control to divan, which runs the `#[divan::bench]` functions.
+}
+
+const LENS: &[usize] = &[1024, 64 * 1024];
+const BIT_WIDTHS: &[u8] = &[4, 16];
+// Value repeated across the whole input; 7 fits in the narrowest width above (4 bits).
+const CONSTANT: u32 = 7;
+
+/// Baseline: run the general `bitpack_encode` pipeline over a constant-filled array.
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn full_encode<const BW: u8>(bencher: Bencher, len: usize) {
+    let values: BufferMut<u32> = std::iter::repeat_n(CONSTANT, len).collect();
+    let input = PrimitiveArray::new(values.freeze(), Validity::NonNullable);
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        bitpack_encode(black_box(&input), black_box(BW), None, &mut ctx).unwrap()
+    });
+}
+
+/// Fast path: build the packed array straight from `(CONSTANT, BW, len)` with
+/// `bitpack_encode_constant`; no input array is materialized.
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn fast_encode<const BW: u8>(bencher: Bencher, len: usize) {
+    let items = ItemsCount::new(len);
+    bencher.counter(items).bench_local(|| {
+        // `black_box` keeps the arguments opaque to the optimizer.
+        let (value, width, count) = (black_box(CONSTANT), black_box(BW), black_box(len));
+        let encoded = bitpack_encode_constant::<u32>(value, width, count, Validity::NonNullable);
+        encoded.unwrap()
+    });
+}
diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs
@@ -194,6 +194,143 @@ pub fn bitpack_primitive<T: NativePType + BitPacking>(array: &[T], bit_width: u8
     output.freeze()
 }
 
+/// Build the bit-packed buffer for a `[constant; len]` input without calling the
+/// SIMD packer for any full 1024-element chunk.
+///
+/// Let `B = 8 * size_of::<T>()` be the width of `T` in bits and `lanes = 1024 / B`.
+/// For constant input every lane of a chunk packs to the same `bit_width` words, so a
+/// full chunk is those words (from `constant_lane_words`), each repeated `lanes` times.
+/// That template is built once and appended to the output for every full chunk.
+///
+/// The trailing partial chunk (when `len % 1024 != 0`) is zero-padded past `len`, so it
+/// has a different pattern than the full template. It is always packed with a single
+/// `BitPacking::unchecked_pack` call over a zero-padded 1024-element scratch array.
+///
+/// Returns an empty buffer when `bit_width == 0` or `len == 0`; otherwise the result
+/// holds `len.div_ceil(1024) * bit_width * lanes` elements.
+///
+/// # Preconditions
+///
+/// Not checked here; `bitpack_encode_constant` validates both before calling.
+/// * `constant` must fit in `bit_width`, i.e., `(constant as u64) < (1 << bit_width)`.
+/// * `bit_width <= size_of::<T>() * 8`.
+pub fn bitpack_constant<T: NativePType + BitPacking>(
+    constant: T,
+    bit_width: u8,
+    len: usize,
+) -> Buffer<T> {
+    if bit_width == 0 || len == 0 {
+        return Buffer::<T>::empty();
+    }
+    let w = bit_width as usize;
+    let t_bits = 8 * size_of::<T>();
+    let lanes = 1024 / t_bits;
+    let packed_len = 128 * w / size_of::<T>();
+    debug_assert_eq!(packed_len, w * lanes);
+
+    let num_chunks = len.div_ceil(1024);
+    let num_full_chunks = len / 1024;
+
+    let mut output = BufferMut::<T>::with_capacity(num_chunks * packed_len);
+
+    if num_full_chunks > 0 {
+        // One full chunk's bit pattern: `w` distinct output words, each replicated `lanes`
+        // times. Only the first `packed_len` (= `w * lanes`) entries of the 1024-element
+        // scratch array are filled and copied into the output, once per full chunk.
+        let lane_words = constant_lane_words::<T>(constant, w);
+        let mut chunk: [T; 1024] = [T::zero(); 1024];
+        for (k, &word) in lane_words.iter().enumerate() {
+            chunk[k * lanes..(k + 1) * lanes].fill(word);
+        }
+        let template = &chunk[..packed_len];
+        for _ in 0..num_full_chunks {
+            output.extend_from_slice(template);
+        }
+    }
+
+    if num_chunks > num_full_chunks {
+        // Tail is zero-padded past `len % 1024`, so it needs the standard packer.
+        let last_chunk_size = len % 1024;
+        let mut last_chunk: [T; 1024] = [T::zero(); 1024];
+        last_chunk[..last_chunk_size].fill(constant);
+        let tail_start = output.len();
+        // SAFETY: `tail_start + packed_len == num_chunks * packed_len`, the reserved capacity;
+        // `unchecked_pack` is assumed to overwrite all `packed_len` words (TODO confirm).
+        unsafe {
+            output.set_len(tail_start + packed_len);
+            BitPacking::unchecked_pack(w, &last_chunk, &mut output[tail_start..][..packed_len]);
+        }
+    }
+
+    output.freeze()
+}
+
+/// Returns the `bit_width` words a single lane emits when every value it packs is
+/// `constant`.
+///
+/// Only the low `bit_width` bits of `constant` are used. With `B = 8 * size_of::<T>()`,
+/// bit `j` of word `k` is bit `(k * B + j) % bit_width` of the masked constant, i.e. the
+/// words are consecutive `B`-bit windows of the masked constant repeated end to end.
+/// Expects `bit_width <= B`; `bit_width == 0` yields an empty vector.
+fn constant_lane_words<T: NativePType + BitPacking>(constant: T, bit_width: usize) -> Vec<T> {
+    let bits_per_word = 8 * size_of::<T>();
+    // A full-width shift would overflow, so that case gets an all-ones mask directly.
+    let low_bits = if bit_width != bits_per_word {
+        (T::one() << bit_width) - T::one()
+    } else {
+        !T::zero()
+    };
+    let masked = constant & low_bits;
+    let mut words = Vec::with_capacity(bit_width);
+    for word_idx in 0..bit_width {
+        let first_bit = word_idx * bits_per_word;
+        let word = (0..bits_per_word).fold(T::zero(), |acc, j| {
+            let source_bit = (masked >> ((first_bit + j) % bit_width)) & T::one();
+            acc | (source_bit << j)
+        });
+        words.push(word);
+    }
+    words
+}
+
+/// Builds the [`BitPackedArray`] for `len` copies of `constant` directly from
+/// [`bitpack_constant`], bypassing the general encode pipeline.
+///
+/// # Errors
+///
+/// Fails when `bit_width` is not strictly smaller than the bit width of `T`, when
+/// `constant` has no `i128` representation, or when it lies outside `[0, 2^bit_width)`.
+pub fn bitpack_encode_constant<T: NativePType + BitPacking + num_traits::ToPrimitive>(
+    constant: T,
+    bit_width: u8,
+    len: usize,
+    validity: Validity,
+) -> VortexResult<BitPackedArray> {
+    // The requested width has to be strictly narrower than the native type.
+    let type_bits = T::PTYPE.bit_width();
+    if bit_width as usize >= type_bits {
+        vortex_bail!(
+            InvalidArgument: "Cannot pack - specified bit width {bit_width} >= {}",
+            type_bits
+        );
+    }
+
+    // Range-check in `i128` so negative constants are rejected as well.
+    let Some(c) = constant.to_i128() else {
+        return Err(vortex_error::vortex_err!("cannot cast constant to i128"));
+    };
+    let largest = (1i128 << bit_width) - 1;
+    if !(0..=largest).contains(&c) {
+        vortex_bail!(
+            InvalidArgument: "constant {c} does not fit in bit_width {bit_width}"
+        );
+    }
+
+    let bytes = bitpack_constant(constant, bit_width, len).into_byte_buffer();
+    let handle = BufferHandle::new_host(bytes);
+    BitPacked::try_new(handle, T::PTYPE, validity, None, bit_width, len, 0)
+}
+
 pub fn gather_patches(
     parray: &PrimitiveArray,
     bit_width: u8,
@@ -650,4 +787,36 @@ mod test {
         assert_arrays_eq!(chunk_offsets, PrimitiveArray::from_iter([0u64]));
         Ok(())
     }
+
+    /// The constant fast path must produce the same packed bytes as the general encoder
+    /// and decode back to the original values.
+    #[rstest::rstest]
+    #[case::aligned_1024(1024u32, 7, 5)]
+    #[case::aligned_multi(8192u32, 7, 5)]
+    #[case::partial_tail(2050u32, 7, 5)]
+    #[case::small(13u32, 5, 17)]
+    #[case::large_bitwidth(1_000_000u32, 18, 200_000)]
+    fn bitpack_constant_matches_full_encode(
+        #[case] len: u32,
+        #[case] bit_width: u8,
+        #[case] constant: u32,
+    ) -> VortexResult<()> {
+        let count = len as usize;
+        let mut ctx = SESSION.create_execution_ctx();
+        let expected = PrimitiveArray::from_iter(std::iter::repeat_n(constant, count));
+
+        let reference = bitpack_encode(&expected, bit_width, None, &mut ctx)?;
+        let candidate =
+            bitpack_encode_constant::<u32>(constant, bit_width, count, Validity::NonNullable)?;
+
+        // Byte-for-byte equality of the packed buffers.
+        let reference_bytes = reference.packed().clone().unwrap_host();
+        let candidate_bytes = candidate.packed().clone().unwrap_host();
+        assert_eq!(reference_bytes.as_slice(), candidate_bytes.as_slice());
+
+        // Decoding the fast-path array must give back the input.
+        let decoded = candidate.into_array().execute::<PrimitiveArray>(&mut ctx)?;
+        assert_arrays_eq!(decoded, expected);
+        Ok(())
+    }
 }
diff --git a/encodings/fastlanes/src/bitpacking/array/mod.rs b/encodings/fastlanes/src/bitpacking/array/mod.rs
@@ -273,6 +273,29 @@ impl BitPackedData {
     pub fn max_packed_value(&self) -> usize {
         (1 << self.bit_width()) - 1
     }
+
+    /// Reports whether `value` lies in `[0, 2^bit_width - 1]`, the range of values a
+    /// packed lane of this array can hold.
+    ///
+    /// Only `self.bit_width()` is consulted; the packed buffer is never read.
+    ///
+    /// # Returns
+    ///
+    /// * `Some(true)` / `Some(false)` for in-range / out-of-range (including negative)
+    ///   values.
+    /// * `None` when `value.to_i128()` yields `None`.
+    #[inline]
+    pub fn value_fits_bit_width<T: NativePType + num_traits::ToPrimitive>(
+        &self,
+        value: T,
+    ) -> Option<bool> {
+        // Compare in `i128`: negative inputs fall below the range start and are
+        // reported as not fitting.
+        value.to_i128().map(|wide| {
+            let upper = (1i128 << self.bit_width()) - 1;
+            (0..=upper).contains(&wide)
+        })
+    }
 }
 
 pub trait BitPackedArrayExt: BitPackedArraySlotsExt {