Skip to content
Closed
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ roaring = "0.11.0"
rstest = "0.26.1"
rstest_reuse = "0.7.0"
rustc-hash = "2.1"
seq-macro = "0.3.6"
serde = "1.0.220"
serde_json = "1.0.138"
serde_test = "1.0.176"
Expand Down
6 changes: 6 additions & 0 deletions encodings/fastlanes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ fastlanes = { workspace = true }
itertools = { workspace = true }
lending-iterator = { workspace = true }
num-traits = { workspace = true }
paste = { workspace = true }
prost = { workspace = true }
rand = { workspace = true, optional = true }
seq-macro = { workspace = true }
vortex-array = { workspace = true }
vortex-buffer = { workspace = true }
vortex-error = { workspace = true }
Expand Down Expand Up @@ -60,6 +62,10 @@ name = "bit_transpose"
harness = false
required-features = ["_test-harness"]

[[bench]]
name = "bitpack_constant"
harness = false

[[bench]]
name = "bitpack_compare"
harness = false
Expand Down
47 changes: 47 additions & 0 deletions encodings/fastlanes/benches/bitpack_compare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,53 @@ fn build_inputs<const BW: u8>(len: usize) -> (ArrayRef, ArrayRef, ExecutionCtx)
(array, rhs, ctx)
}

/// Construct a bit-packed `u32` array of `len` values cycling through `0..2^BW`, paired
/// with a constant right-hand side that lies *inside* the packed range.
///
/// Because the constant is representable in `BW` bits, the out-of-range fast path does not
/// apply and the streaming / fused unpack-compare path runs instead.
///
/// Returns `(packed_array, constant_rhs, execution_ctx)`.
fn build_in_range_inputs<const BW: u8>(len: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // Values wrap at 2^BW, so each one fits in `BW` bits.
    let values: BufferMut<u32> = (0..len).map(|idx| (idx as u32) % (1 << BW)).collect();
    let primitive = PrimitiveArray::new(values.freeze(), Validity::NonNullable).into_array();
    let packed = BitPackedData::encode(&primitive, BW, &mut ctx)
        .unwrap()
        .into_array();

    // Midpoint of [0, 2^BW - 1]: in range, so every lane must actually be inspected.
    let midpoint = (1u32 << BW) / 2;
    let rhs = ConstantArray::new(midpoint, len).into_array();

    (packed, rhs, ctx)
}

/// Benchmark `Eq` between the bit-packed array and an in-range constant, executing the
/// comparison expression to a `BoolArray`.
#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn in_range_eq<const BW: u8>(bencher: Bencher, len: usize) {
    let (packed, constant, mut ctx) = build_in_range_inputs::<BW>(len);
    let throughput = ItemsCount::new(len);
    bencher.counter(throughput).bench_local(|| {
        // Both operands are cloned per iteration because `binary` consumes them.
        let comparison = packed
            .clone()
            .binary(constant.clone(), Operator::Eq)
            .unwrap();
        comparison.execute::<BoolArray>(&mut ctx).unwrap()
    });
}

/// Baseline for `in_range_eq`: first execute the packed array to a `PrimitiveArray`, then
/// run the same `Eq` comparison on the unpacked values.
#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn in_range_eq_baseline<const BW: u8>(bencher: Bencher, len: usize) {
    let (packed, constant, mut ctx) = build_in_range_inputs::<BW>(len);
    let throughput = ItemsCount::new(len);
    bencher.counter(throughput).bench_local(|| {
        // Step 1 of what the fallback would do: materialize the unpacked primitive.
        let unpacked = packed.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
        // Step 2: run Arrow compare on the materialized values.
        let comparison = unpacked
            .into_array()
            .binary(constant.clone(), Operator::Eq)
            .unwrap();
        comparison.execute::<BoolArray>(&mut ctx).unwrap()
    });
}

#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn fast_eq_out_of_range<const BW: u8>(bencher: Bencher, len: usize) {
let (array, rhs, mut ctx) = build_inputs::<BW>(len);
Expand Down
53 changes: 53 additions & 0 deletions encodings/fastlanes/benches/bitpack_constant.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Compare the fast constant bit-packing path against the standard `bitpack_encode`
//! pipeline on a uniform-constant input.
//!
//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_constant`.

#![expect(clippy::unwrap_used)]

use divan::Bencher;
use divan::black_box;
use divan::counter::ItemsCount;
use vortex_array::LEGACY_SESSION;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::BufferMut;
use vortex_fastlanes::bitpack_compress::bitpack_encode;
use vortex_fastlanes::bitpack_compress::bitpack_encode_constant;

/// Benchmark binary entry point; delegates to `divan::main()`.
fn main() {
divan::main();
}

/// Input lengths passed to each benchmark through `args = LENS`.
const LENS: &[usize] = &[1024, 64 * 1024];
/// Bit widths each benchmark is instantiated with through `consts = BIT_WIDTHS`.
const BIT_WIDTHS: &[u8] = &[4, 16];

/// The value repeated in every benchmark input; 7 fits in the narrowest benchmarked
/// width (4 bits).
const CONSTANT: u32 = 7;

/// Baseline: run the regular `bitpack_encode` pipeline over a materialized array holding
/// `len` copies of `CONSTANT`.
#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn full_encode<const BW: u8>(bencher: Bencher, len: usize) {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // The input is built once, outside the timed closure.
    let values: BufferMut<u32> = std::iter::repeat_n(CONSTANT, len).collect();
    let input = PrimitiveArray::new(values.freeze(), Validity::NonNullable);

    let throughput = ItemsCount::new(len);
    bencher.counter(throughput).bench_local(|| {
        let encoded = bitpack_encode(black_box(&input), black_box(BW), None, &mut ctx);
        encoded.unwrap()
    });
}

/// Fast path: encode `len` copies of `CONSTANT` with `bitpack_encode_constant`, which takes
/// the constant and length directly instead of a materialized input array.
#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn fast_encode<const BW: u8>(bencher: Bencher, len: usize) {
    let throughput = ItemsCount::new(len);
    bencher.counter(throughput).bench_local(|| {
        // `black_box` keeps the optimizer from specializing on the known arguments.
        let value = black_box(CONSTANT);
        let width = black_box(BW);
        let count = black_box(len);
        let encoded = bitpack_encode_constant::<u32>(value, width, count, Validity::NonNullable);
        encoded.unwrap()
    });
}
169 changes: 169 additions & 0 deletions encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,143 @@ pub fn bitpack_primitive<T: NativePType + BitPacking>(array: &[T], bit_width: u8
output.freeze()
}

/// Build the bit-packed buffer for a `[constant; len]` input without running the SIMD
/// packer over every chunk.
///
/// Let `BITS = 8 * size_of::<T>()`. The FastLanes packing kernel runs `LANES = 1024 / BITS`
/// independent lane packers in parallel, each consuming `BITS` input values and producing
/// `bit_width` output words. When every input value equals `constant`, all lanes produce
/// the same `bit_width` words. Those words are computed analytically by
/// `constant_lane_words`, each one is replicated across the `LANES` lanes to form a single
/// chunk template, and the template is copied once per full 1024-element chunk. No call to
/// `BitPacking::pack` is involved for any full chunk.
///
/// The trailing partial chunk (when `len % 1024 != 0`) is zero-padded past `len`, so its
/// bit pattern differs from the full template. It is always produced by a single
/// `BitPacking::unchecked_pack` call over a zero-padded scratch chunk.
///
/// Returns an empty buffer when `bit_width == 0` or `len == 0`.
///
/// # Preconditions
///
/// * `constant` must fit in `bit_width`, i.e., `(constant as u64) < (1 << bit_width)`.
/// * `bit_width <= size_of::<T>() * 8`.
///
/// # Panics
///
/// Panics if `bit_width` exceeds the bit width of `T`.
pub fn bitpack_constant<T: NativePType + BitPacking>(
    constant: T,
    bit_width: u8,
    len: usize,
) -> Buffer<T> {
    if bit_width == 0 || len == 0 {
        return Buffer::<T>::empty();
    }
    let w = bit_width as usize;
    let t_bits = 8 * size_of::<T>();
    // Checked in release builds as well: this is a safe public function that reaches
    // `unchecked_pack`, and a wider-than-`T` width would also overrun the 1024-element
    // stack template below.
    assert!(
        w <= t_bits,
        "bit_width {bit_width} exceeds the {t_bits}-bit width of the element type"
    );
    let lanes = 1024 / t_bits;
    let packed_len = 128 * w / size_of::<T>();
    debug_assert_eq!(packed_len, w * lanes);

    let num_full_chunks = len / 1024;
    let tail_len = len % 1024;
    let num_chunks = num_full_chunks + usize::from(tail_len != 0);

    let mut output = BufferMut::<T>::with_capacity(num_chunks * packed_len);

    if num_full_chunks > 0 {
        // One full chunk's bit pattern: `w` distinct output words, each replicated `lanes`
        // times. Build the template on the stack, then copy it into the output once per
        // full chunk.
        let lane_words = constant_lane_words::<T>(constant, w);
        let mut chunk: [T; 1024] = [T::zero(); 1024];
        for (row, &word) in chunk[..packed_len].chunks_exact_mut(lanes).zip(&lane_words) {
            row.fill(word);
        }
        let template = &chunk[..packed_len];
        for _ in 0..num_full_chunks {
            output.extend_from_slice(template);
        }
    }

    if tail_len > 0 {
        // Tail chunk gets zero-padded past `tail_len`, so it differs from the full
        // template. Use the standard packer for this single chunk.
        let mut last_chunk: [T; 1024] = [T::zero(); 1024];
        last_chunk[..tail_len].fill(constant);

        // Append zeroed output words first so the packer writes into initialized memory;
        // no uninitialized elements are ever exposed through a slice.
        let tail_start = output.len();
        output.extend_from_slice(&[T::zero(); 1024][..packed_len]);

        // SAFETY: `w <= t_bits` is asserted above, the input is exactly 1024 elements, and
        // the output slice is exactly `128 * w / size_of::<T>()` elements, matching the
        // sizes `unchecked_pack` expects.
        unsafe {
            BitPacking::unchecked_pack(w, &last_chunk, &mut output[tail_start..][..packed_len]);
        }
    }

    output.freeze()
}

/// Compute the `bit_width` output words that every FastLanes lane produces when packing
/// `BITS = 8 * size_of::<T>()` copies of `constant`.
///
/// For constant input, each lane packs a periodic bit-stream of period `bit_width` made of
/// the low `bit_width` bits of `constant`. Output word `k` holds stream bits
/// `[k * BITS, (k + 1) * BITS)`, so bit `j` of word `k` equals bit
/// `(k * BITS + j) mod bit_width` of `constant`.
///
/// Returns a vector of exactly `bit_width` words (empty when `bit_width == 0`).
fn constant_lane_words<T: NativePType + BitPacking>(constant: T, bit_width: usize) -> Vec<T> {
    let word_bits = 8 * size_of::<T>();

    // Keep only the low `bit_width` bits; a full-width shift would overflow, so that case
    // uses the all-ones mask directly.
    let low_mask = if bit_width == word_bits {
        !T::zero()
    } else {
        (T::one() << bit_width) - T::one()
    };
    let pattern = constant & low_mask;

    let mut words = Vec::with_capacity(bit_width);
    for word_idx in 0..bit_width {
        let stream_base = word_idx * word_bits;
        // Assemble the word one bit at a time from the periodic stream.
        let word = (0..word_bits).fold(T::zero(), |acc, bit_pos| {
            let source_bit = (stream_base + bit_pos) % bit_width;
            acc | (((pattern >> source_bit) & T::one()) << bit_pos)
        });
        words.push(word);
    }
    words
}

/// Encode `len` copies of `constant` as a [`BitPackedArray`], building the packed buffer
/// with `bitpack_constant` rather than the standard encode pipeline.
///
/// # Errors
///
/// Returns an error if:
/// * `bit_width` is greater than or equal to the bit width of `T`,
/// * `constant` cannot be converted to `i128`, or
/// * `constant` is negative or larger than `2^bit_width - 1`.
///
/// Any error from `BitPacked::try_new` is propagated as well.
pub fn bitpack_encode_constant<T: NativePType + BitPacking + num_traits::ToPrimitive>(
    constant: T,
    bit_width: u8,
    len: usize,
    validity: Validity,
) -> VortexResult<BitPackedArray> {
    let type_bits = T::PTYPE.bit_width();
    if bit_width as usize >= type_bits {
        vortex_bail!(
            InvalidArgument: "Cannot pack - specified bit width {bit_width} >= {}",
            type_bits
        );
    }

    let Some(c) = constant.to_i128() else {
        return Err(vortex_error::vortex_err!("cannot cast constant to i128"));
    };
    // `bit_width` is below the type width here, so this shift cannot overflow `i128`.
    let max_packed = (1i128 << bit_width) - 1;
    if !(0..=max_packed).contains(&c) {
        vortex_bail!(
            InvalidArgument: "constant {c} does not fit in bit_width {bit_width}"
        );
    }

    let packed_bytes = bitpack_constant(constant, bit_width, len).into_byte_buffer();
    let handle = BufferHandle::new_host(packed_bytes);
    // NOTE(review): the `None` and trailing `0` are presumably "no patches" and "zero
    // offset" — confirm against `BitPacked::try_new`.
    BitPacked::try_new(handle, T::PTYPE, validity, None, bit_width, len, 0)
}

pub fn gather_patches(
parray: &PrimitiveArray,
bit_width: u8,
Expand Down Expand Up @@ -650,4 +787,36 @@ mod test {
assert_arrays_eq!(chunk_offsets, PrimitiveArray::from_iter([0u64]));
Ok(())
}

/// The constant fast path must produce the same packed bytes as the regular encoder, and
/// the result must decode back to the original values. Cases are `(len, bit_width, constant)`.
#[rstest::rstest]
#[case::aligned_1024(1024u32, 7, 5)]
#[case::aligned_multi(8192u32, 7, 5)]
#[case::partial_tail(2050u32, 7, 5)]
#[case::small(13u32, 5, 17)]
#[case::large_bitwidth(1_000_000u32, 18, 200_000)]
fn bitpack_constant_matches_full_encode(
    #[case] len: u32,
    #[case] bit_width: u8,
    #[case] constant: u32,
) -> VortexResult<()> {
    let element_count = len as usize;
    let mut ctx = SESSION.create_execution_ctx();
    let expected = PrimitiveArray::from_iter(std::iter::repeat_n(constant, element_count));

    // Reference encoding via the standard pipeline versus the constant fast path.
    let reference = bitpack_encode(&expected, bit_width, None, &mut ctx)?;
    let candidate = bitpack_encode_constant::<u32>(
        constant,
        bit_width,
        element_count,
        Validity::NonNullable,
    )?;

    // The packed buffers must match exactly.
    let reference_bytes = reference.packed().clone().unwrap_host();
    let candidate_bytes = candidate.packed().clone().unwrap_host();
    assert_eq!(reference_bytes.as_slice(), candidate_bytes.as_slice());

    // Decode the fast-path result and check the round trip.
    let decoded = candidate.into_array().execute::<PrimitiveArray>(&mut ctx)?;
    assert_arrays_eq!(decoded, expected);
    Ok(())
}
}
23 changes: 23 additions & 0 deletions encodings/fastlanes/src/bitpacking/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,29 @@ impl BitPackedData {
pub fn max_packed_value(&self) -> usize {
    // Largest value representable in `bit_width` bits, i.e. `2^bit_width - 1`.
    //
    // A plain `1 << bit_width` overflows `usize` once `bit_width` reaches `usize::BITS`
    // (a panic in debug builds, a wrapped result of 0 in release). Saturate instead: a
    // width covering every bit of `usize` has `usize::MAX` as its largest value.
    1usize
        .checked_shl(u32::from(self.bit_width()))
        .map_or(usize::MAX, |limit| limit - 1)
}

/// Report whether `value` lies in `[0, 2^bit_width - 1]`, the range a packed lane of this
/// array can represent.
///
/// Only the array's bit width is consulted; the packed buffer is never read. Comparison
/// kernels can use this to short-circuit when a constant cannot match any packed lane.
///
/// Returns `None` when `value` cannot be converted to `i128`, `Some(false)` for negative
/// or too-large values, and `Some(true)` otherwise.
#[inline]
pub fn value_fits_bit_width<T: NativePType + num_traits::ToPrimitive>(
    &self,
    value: T,
) -> Option<bool> {
    value.to_i128().map(|wide| {
        // Short-circuit on negatives so the upper bound is only computed when needed.
        wide >= 0 && wide <= (1i128 << self.bit_width()) - 1
    })
}
}

pub trait BitPackedArrayExt: BitPackedArraySlotsExt {
Expand Down
Loading
Loading