From 2d5e22b7926f951717cf4de7cfb25bcdae5da686 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 23 Apr 2026 19:18:51 -0400 Subject: [PATCH 01/16] feat: use cardinality estimator for distinct count stats Replace the exact `HashMap`/`HashSet` previously used to compute distinct-value counts during compression stats generation with Cloudflare's `cardinality-estimator` crate. The estimator gives us a bounded-memory approximation (exact up to ~128 distinct values, then HyperLogLog++) so high-cardinality arrays no longer require an O(n) auxiliary hash table to answer the single question "how many unique values does this have?". - Integer stats swap the hash map for a `CardinalityEstimator` and track the most frequent value via a Boyer-Moore majority candidate plus a second-pass exact count. Sparse/dict schemes only care about the heavy hitter (>= 90% threshold) or a rough distinct ratio, so this is behaviourally equivalent for the decisions they make. - Float and string stats likewise drop their hash sets in favor of the estimator. - The integer and float dictionary encoders now rebuild the exact set of distinct values from the source array at compress time, since they need the values themselves and the stats layer no longer retains them. - `SequenceScheme`'s fast-path check for "all values are distinct" now tolerates the estimator's small approximation error; the deferred callback still validates sequences exactly. Signed-off-by: Robert Kruszewski --- Cargo.lock | 33 +++ Cargo.toml | 2 + vortex-array/benches/dict_compare.rs | 18 +- vortex-array/benches/dict_compress.rs | 47 ++-- vortex-array/src/arrays/dict/array.rs | 20 +- vortex-array/src/arrays/dict/compute/cast.rs | 42 ++- vortex-array/src/arrays/dict/compute/mod.rs | 34 ++- .../src/arrays/primitive/array/mod.rs | 8 +- vortex-array/src/arrays/varbin/vtable/mod.rs | 1 - vortex-array/src/builders/dict/bytes.rs | 253 ++++++++++++------ vortex-array/src/builders/dict/mod.rs | 5 +- vortex-array/src/builders/dict/primitive.rs | 92 ++++--- vortex-array/src/scalar/arrow.rs | 4 +- vortex-buffer/src/trusted_len.rs | 6 +- vortex-compressor/Cargo.toml | 9 + vortex-compressor/benches/dict_encode.rs | 12 - .../src/builtins/constant/binary.rs | 2 +- .../src/builtins/constant/string.rs | 7 +- vortex-compressor/src/builtins/dict/float.rs | 131 +-------- .../src/builtins/dict/integer.rs | 144 +--------- vortex-compressor/src/builtins/dict/mod.rs | 2 - vortex-compressor/src/builtins/mod.rs | 2 - vortex-compressor/src/stats/bool.rs | 15 +- vortex-compressor/src/stats/cardinality.rs | 146 ++++++++++ vortex-compressor/src/stats/float.rs | 59 ++-- vortex-compressor/src/stats/integer.rs | 170 +++++++++--- vortex-compressor/src/stats/mod.rs | 1 + vortex-compressor/src/stats/varbinview.rs | 95 +++++-- vortex-ffi/src/data_source.rs | 2 +- vortex-layout/src/layouts/dict/writer.rs | 6 +- vortex/benches/single_encoding_throughput.rs | 10 +- 31 files changed, 775 insertions(+), 603 deletions(-) create mode 100644 vortex-compressor/src/stats/cardinality.rs diff --git a/Cargo.lock b/Cargo.lock index c7c50f5047e..c83f5c50463 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1036,6 +1036,16 @@ dependencies = [ "serde_core", ] +[[package]] +name = "cardinality-estimator" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53a7bc4e9a7ab9239a4a46df35d75101cee57defc8e4255a1b2b6f8eab8de87e" +dependencies = [ + "enum_dispatch", + "wyhash", +] + [[package]] name = "cargo-platform" version = "0.3.3" @@ -3591,6 +3601,18 @@ dependencies = [ "syn 2.0.118", ] +[[package]] +name = "enum_dispatch" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.118", +] + [[package]] name = "env_filter" version = "1.0.1" @@ -9704,7 +9726,9 @@ dependencies = [ name = "vortex-compressor" version = "0.1.0" dependencies = [ + "cardinality-estimator", "codspeed-divan-compat", + "hyperloglogplus", "itertools 0.14.0", "num-traits", "parking_lot", @@ -11102,6 +11126,15 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "wyhash" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf6e163c25e3fac820b4b453185ea2dea3b6a3e0a721d4d23d75bd33734c295" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "wyz" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index 0a1cd5731e9..36c9b76713b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -119,6 +119,7 @@ bit-vec = "0.9.0" bitvec = "1.0.1" bytes = "1.11.1" bzip2 = "0.6.0" +cardinality-estimator = "1.0.3" cargo_metadata = "0.23.1" cbindgen = "0.29.0" cc = "1.2" @@ -168,6 +169,7 @@ goldenfile = "1" half = { version = "2.7.1", features = ["std", "num-traits"] } hashbrown = "0.17.1" humansize = "2.1.3" +hyperloglogplus = "0.4.1" indicatif = "0.18.0" insta = "1.43" inventory = "0.3.20" diff --git a/vortex-array/benches/dict_compare.rs b/vortex-array/benches/dict_compare.rs index 69d9d3ddf4b..5ef450173f5 100644 --- a/vortex-array/benches/dict_compare.rs +++ b/vortex-array/benches/dict_compare.rs @@ -48,6 +48,9 @@ const LENGTH_AND_UNIQUE_VALUES: &[(usize, usize)] = &[ (100_000, 2048), ]; +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + #[divan::bench(args = LENGTH_AND_UNIQUE_VALUES)] fn bench_compare_primitive(bencher: divan::Bencher, (len, uniqueness): (usize, usize)) { let primitive_arr = gen_primitive_for_dict::(len, uniqueness); @@ -57,10 +60,9 @@ fn bench_compare_primitive(bencher: divan::Bencher, (len, uniqueness): (usize, u ) .unwrap(); let value = primitive_arr.as_slice::()[0]; - let session = vortex_array::array_session(); bencher - .with_inputs(|| (&dict, session.create_execution_ctx())) + .with_inputs(|| (&dict, SESSION.create_execution_ctx())) .bench_refs(|(dict, ctx)| { dict.clone() .into_array() @@ -81,10 +83,9 @@ fn bench_compare_varbin(bencher: divan::Bencher, (len, uniqueness): (usize, usiz .unwrap(); let bytes = varbin_arr.with_iterator(|i| i.next().unwrap().unwrap().to_vec()); let value = from_utf8(bytes.as_slice()).unwrap(); - let session = vortex_array::array_session(); bencher - .with_inputs(|| (&dict, session.create_execution_ctx())) + .with_inputs(|| (&dict, SESSION.create_execution_ctx())) .bench_refs(|(dict, ctx)| { dict.clone() .into_array() @@ -105,10 +106,9 @@ fn bench_compare_varbinview(bencher: divan::Bencher, (len, uniqueness): (usize, .unwrap(); let bytes = varbinview_arr.with_iterator(|i| i.next().unwrap().unwrap().to_vec()); let value = from_utf8(bytes.as_slice()).unwrap(); - let session = vortex_array::array_session(); bencher - .with_inputs(|| (&dict, session.create_execution_ctx())) + .with_inputs(|| (&dict, SESSION.create_execution_ctx())) .bench_refs(|(dict, ctx)| { dict.clone() .into_array() @@ -144,10 +144,9 @@ fn bench_compare_sliced_dict_primitive( .unwrap(); let dict = dict.into_array().slice(0..codes_len).unwrap(); let value = primitive_arr.as_slice::()[0]; - let session = vortex_array::array_session(); bencher - .with_inputs(|| (&dict, session.create_execution_ctx())) + .with_inputs(|| (&dict, SESSION.create_execution_ctx())) .bench_refs(|(dict, ctx)| { dict.clone() .apply(&eq(root(), lit(value))) @@ -171,10 +170,9 @@ fn bench_compare_sliced_dict_varbinview( let dict = dict.into_array().slice(0..codes_len).unwrap(); let bytes = varbin_arr.with_iterator(|i| i.next().unwrap().unwrap().to_vec()); let value = from_utf8(bytes.as_slice()).unwrap(); - let session = vortex_array::array_session(); bencher - .with_inputs(|| (&dict, session.create_execution_ctx())) + .with_inputs(|| (&dict, SESSION.create_execution_ctx())) .bench_refs(|(dict, ctx)| { dict.clone() .apply(&eq(root(), lit(value))) diff --git a/vortex-array/benches/dict_compress.rs b/vortex-array/benches/dict_compress.rs index 3bd287414c0..2901d82c6b5 100644 --- a/vortex-array/benches/dict_compress.rs +++ b/vortex-array/benches/dict_compress.rs @@ -45,29 +45,30 @@ where T: NativePType, StandardUniform: Distribution, { - let primitive_arr = gen_primitive_for_dict::(len, unique_values); + let primitive_arr = gen_primitive_for_dict::(len, unique_values).into_array(); bencher .with_inputs(|| (&primitive_arr, SESSION.create_execution_ctx())) - .bench_refs(|(arr, ctx)| dict_encode(&arr.clone().into_array(), ctx)); + .bench_refs(|(arr, ctx)| dict_encode(arr, ctx)); } #[divan::bench(args = BENCH_ARGS)] fn encode_varbin(bencher: Bencher, (len, unique_values): (usize, usize)) { - let varbin_arr = VarBinArray::from(gen_varbin_words(len, unique_values)); + let varbin_arr = VarBinArray::from(gen_varbin_words(len, unique_values)).into_array(); bencher .with_inputs(|| (&varbin_arr, SESSION.create_execution_ctx())) - .bench_refs(|(arr, ctx)| dict_encode(&arr.clone().into_array(), ctx)); + .bench_refs(|(arr, ctx)| dict_encode(arr, ctx)); } #[divan::bench(args = BENCH_ARGS)] fn encode_varbinview(bencher: Bencher, (len, unique_values): (usize, usize)) { - let varbinview_arr = VarBinViewArray::from_iter_str(gen_varbin_words(len, unique_values)); + let varbinview_arr = + VarBinViewArray::from_iter_str(gen_varbin_words(len, unique_values)).into_array(); bencher .with_inputs(|| (&varbinview_arr, SESSION.create_execution_ctx())) - .bench_refs(|(arr, ctx)| dict_encode(&arr.clone().into_array(), ctx)); + .bench_refs(|(arr, ctx)| dict_encode(arr, ctx)); } #[divan::bench(types = [u8, f32, i64], args = BENCH_ARGS)] @@ -76,13 +77,10 @@ where T: NativePType, StandardUniform: Distribution, { - let primitive_arr = gen_primitive_for_dict::(len, unique_values); - let dict = dict_encode( - &primitive_arr.into_array(), - &mut SESSION.create_execution_ctx(), - ) - .unwrap() - .into_array(); + let primitive_arr = gen_primitive_for_dict::(len, unique_values).into_array(); + let dict = dict_encode(&primitive_arr, &mut SESSION.create_execution_ctx()) + .unwrap() + .into_array(); bencher .with_inputs(|| (&dict, SESSION.create_execution_ctx())) @@ -91,13 +89,10 @@ where #[divan::bench(args = BENCH_ARGS)] fn decode_varbin(bencher: Bencher, (len, unique_values): (usize, usize)) { - let varbin_arr = VarBinArray::from(gen_varbin_words(len, unique_values)); - let dict = dict_encode( - &varbin_arr.into_array(), - &mut SESSION.create_execution_ctx(), - ) - .unwrap() - .into_array(); + let varbin_arr = VarBinArray::from(gen_varbin_words(len, unique_values)).into_array(); + let dict = dict_encode(&varbin_arr, &mut SESSION.create_execution_ctx()) + .unwrap() + .into_array(); bencher .with_inputs(|| (&dict, SESSION.create_execution_ctx())) @@ -106,13 +101,11 @@ fn decode_varbin(bencher: Bencher, (len, unique_values): (usize, usize)) { #[divan::bench(args = BENCH_ARGS)] fn decode_varbinview(bencher: Bencher, (len, unique_values): (usize, usize)) { - let varbinview_arr = VarBinViewArray::from_iter_str(gen_varbin_words(len, unique_values)); - let dict = dict_encode( - &varbinview_arr.into_array(), - &mut SESSION.create_execution_ctx(), - ) - .unwrap() - .into_array(); + let varbinview_arr = + VarBinViewArray::from_iter_str(gen_varbin_words(len, unique_values)).into_array(); + let dict = dict_encode(&varbinview_arr, &mut SESSION.create_execution_ctx()) + .unwrap() + .into_array(); bencher .with_inputs(|| (&dict, SESSION.create_execution_ctx())) diff --git a/vortex-array/src/arrays/dict/array.rs b/vortex-array/src/arrays/dict/array.rs index de6b765d8b0..f73f4965439 100644 --- a/vortex-array/src/arrays/dict/array.rs +++ b/vortex-array/src/arrays/dict/array.rs @@ -4,6 +4,7 @@ use std::fmt::Display; use std::fmt::Formatter; +use num_traits::AsPrimitive; use smallvec::smallvec; use vortex_buffer::BitBuffer; use vortex_error::VortexExpect; @@ -159,13 +160,9 @@ pub trait DictArrayExt: TypedArrayRef + DictArraySlotsExt { match codes_validity.bit_buffer() { AllOr::All => { match_each_integer_ptype!(codes_primitive.ptype(), |P| { - #[allow( - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index" - )] - for &idx in codes_primitive.as_slice::

() { - values_vec[idx as usize] = referenced_value; + for idx in codes_primitive.as_slice::

() { + let idxu: usize = idx.as_(); + values_vec[idxu] = referenced_value; } }); } @@ -173,14 +170,9 @@ pub trait DictArrayExt: TypedArrayRef + DictArraySlotsExt { AllOr::Some(mask) => { match_each_integer_ptype!(codes_primitive.ptype(), |P| { let codes = codes_primitive.as_slice::

(); - - #[allow( - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index" - )] mask.set_indices().for_each(|idx| { - values_vec[codes[idx] as usize] = referenced_value; + let idxu: usize = codes[idx].as_(); + values_vec[idxu] = referenced_value; }); }); } diff --git a/vortex-array/src/arrays/dict/compute/cast.rs b/vortex-array/src/arrays/dict/compute/cast.rs index c4438891176..215ebddaab6 100644 --- a/vortex-array/src/arrays/dict/compute/cast.rs +++ b/vortex-array/src/arrays/dict/compute/cast.rs @@ -52,9 +52,8 @@ mod tests { use vortex_buffer::buffer; use vortex_session::VortexSession; + use crate::ArrayRef; use crate::IntoArray; - #[expect(deprecated)] - use crate::ToCanonical as _; use crate::VortexSessionExecute; use crate::arrays::Dict; use crate::arrays::PrimitiveArray; @@ -71,8 +70,9 @@ mod tests { #[test] fn test_cast_dict_to_wider_type() { + let ctx = &mut SESSION.create_execution_ctx(); let values = buffer![1i32, 2, 3, 2, 1].into_array(); - let dict = dict_encode(&values, &mut SESSION.create_execution_ctx()).unwrap(); + let dict = dict_encode(&values, ctx).unwrap(); let casted = dict .into_array() @@ -83,13 +83,8 @@ mod tests { &DType::Primitive(PType::I64, Nullability::NonNullable) ); - #[expect(deprecated)] - let decoded = casted.to_primitive(); - assert_arrays_eq!( - decoded, - PrimitiveArray::from_iter([1i64, 2, 3, 2, 1]), - &mut SESSION.create_execution_ctx() - ); + let decoded = casted.into_array().execute::(ctx).unwrap(); + assert_arrays_eq!(decoded, PrimitiveArray::from_iter([1i64, 2, 3, 2, 1]), ctx); } #[test] @@ -110,9 +105,10 @@ mod tests { #[test] fn test_cast_dict_allvalid_to_nonnullable_and_back() { + let ctx = &mut SESSION.create_execution_ctx(); // Create an AllValid dict array (no nulls) let values = buffer![10i32, 20, 30, 40].into_array(); - let dict = dict_encode(&values, &mut SESSION.create_execution_ctx()).unwrap(); + let dict = dict_encode(&values, ctx).unwrap(); // Verify initial state - codes should be NonNullable, values should be NonNullable assert_eq!(dict.codes().dtype().nullability(), Nullability::NonNullable); @@ -173,15 +169,10 @@ mod tests { ); // Verify values are unchanged - #[expect(deprecated)] - let original_values = dict.as_array().to_primitive(); - #[expect(deprecated)] - let final_values = back_to_non_nullable.to_primitive(); - assert_arrays_eq!( - original_values, - final_values, - &mut SESSION.create_execution_ctx() - ); + let original_values = dict.into_array().execute::(ctx).unwrap(); + + let final_values = back_to_non_nullable.execute::(ctx).unwrap(); + assert_arrays_eq!(original_values, final_values, ctx); } #[rstest] @@ -189,15 +180,13 @@ mod tests { #[case(dict_encode(&buffer![100u32, 200, 100, 300, 200].into_array(), &mut SESSION.create_execution_ctx()).unwrap().into_array())] #[case(dict_encode(&PrimitiveArray::from_option_iter([Some(1i32), None, Some(2), Some(1), None]).into_array(), &mut SESSION.create_execution_ctx()).unwrap().into_array())] #[case(dict_encode(&buffer![1.5f32, 2.5, 1.5, 3.5].into_array(), &mut SESSION.create_execution_ctx()).unwrap().into_array())] - fn test_cast_dict_conformance(#[case] array: crate::ArrayRef) { + fn test_cast_dict_conformance(#[case] array: ArrayRef) { test_cast_conformance(&array); } #[test] fn test_cast_dict_with_unreferenced_null_values_to_nonnullable() { - use crate::arrays::DictArray; - use crate::validity::Validity; - + let ctx = &mut SESSION.create_execution_ctx(); // Create a dict with nullable values that have unreferenced null entries. // Values: [1.0, null, 3.0] (index 1 is null but no code points to it) // Codes: [0, 2, 0] (only reference indices 0 and 2, never 1) @@ -228,12 +217,11 @@ mod tests { casted.dtype(), &DType::Primitive(PType::F64, Nullability::NonNullable) ); - #[expect(deprecated)] - let casted_prim = casted.to_primitive(); + let casted_prim = casted.execute::(ctx).unwrap(); assert_arrays_eq!( casted_prim, PrimitiveArray::from_iter([1.0f64, 3.0, 1.0]), - &mut SESSION.create_execution_ctx() + ctx ); } } diff --git a/vortex-array/src/arrays/dict/compute/mod.rs b/vortex-array/src/arrays/dict/compute/mod.rs index 7ba4edd18f1..defa44a0976 100644 --- a/vortex-array/src/arrays/dict/compute/mod.rs +++ b/vortex-array/src/arrays/dict/compute/mod.rs @@ -56,15 +56,11 @@ impl FilterReduce for Dict { mod test { use std::sync::LazyLock; - #[expect(unused_imports)] - use itertools::Itertools; use vortex_buffer::buffer; use vortex_session::VortexSession; use crate::ArrayRef; use crate::IntoArray; - #[expect(deprecated)] - use crate::ToCanonical as _; use crate::VortexSessionExecute; use crate::accessor::ArrayAccessor; use crate::arrays::ConstantArray; @@ -101,8 +97,10 @@ mod test { &mut SESSION.create_execution_ctx(), ) .unwrap(); - #[expect(deprecated)] - let actual = dict.as_array().to_primitive(); + let actual = dict + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); let expected = PrimitiveArray::from_option_iter(values); @@ -119,9 +117,12 @@ mod test { &expected.clone().into_array(), &mut SESSION.create_execution_ctx(), ) - .unwrap(); - #[expect(deprecated)] - let actual = dict.as_array().to_primitive(); + .unwrap() + .into_array(); + + let actual = dict + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); assert_arrays_eq!(actual, expected, &mut ctx); } @@ -136,9 +137,12 @@ mod test { &expected.clone().into_array(), &mut SESSION.create_execution_ctx(), ) - .unwrap(); - #[expect(deprecated)] - let actual = dict.as_array().to_primitive(); + .unwrap() + .into_array(); + + let actual = dict + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); assert_arrays_eq!(actual, expected, &mut ctx); } @@ -155,8 +159,10 @@ mod test { &mut SESSION.create_execution_ctx(), ) .unwrap(); - #[expect(deprecated)] - let flattened_dict = dict.as_array().to_varbinview(); + let flattened_dict = dict + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); assert_eq!( flattened_dict.with_iterator(|iter| iter .map(|slice| slice.map(|s| s.to_vec())) diff --git a/vortex-array/src/arrays/primitive/array/mod.rs b/vortex-array/src/arrays/primitive/array/mod.rs index 98bd993c7f9..65ef3ca0490 100644 --- a/vortex-array/src/arrays/primitive/array/mod.rs +++ b/vortex-array/src/arrays/primitive/array/mod.rs @@ -488,18 +488,18 @@ impl Array { let buffer = match &validity { Validity::NonNullable | Validity::AllValid => { - BufferMut::::from_iter(buf_iter.zip(iter::repeat(true)).map(f)) + Buffer::::from_trusted_len_iter(buf_iter.zip(iter::repeat(true)).map(f)) } Validity::AllInvalid => { - BufferMut::::from_iter(buf_iter.zip(iter::repeat(false)).map(f)) + Buffer::::from_trusted_len_iter(buf_iter.zip(iter::repeat(false)).map(f)) } Validity::Array(val) => { #[expect(deprecated)] let val = val.to_bool().into_bit_buffer(); - BufferMut::::from_iter(buf_iter.zip(val.iter()).map(f)) + Buffer::::from_trusted_len_iter(buf_iter.zip(val.iter()).map(f)) } }; - Ok(PrimitiveArray::new(buffer.freeze(), validity)) + Ok(PrimitiveArray::new(buffer, validity)) } } diff --git a/vortex-array/src/arrays/varbin/vtable/mod.rs b/vortex-array/src/arrays/varbin/vtable/mod.rs index 71003aa9cad..90dcf320427 100644 --- a/vortex-array/src/arrays/varbin/vtable/mod.rs +++ b/vortex-array/src/arrays/varbin/vtable/mod.rs @@ -142,7 +142,6 @@ impl VTable for VarBin { dtype: &DType, len: usize, metadata: &[u8], - buffers: &[BufferHandle], children: &dyn ArrayChildren, _session: &VortexSession, diff --git a/vortex-array/src/builders/dict/bytes.rs b/vortex-array/src/builders/dict/bytes.rs index 65c49fa0173..3a045b5c21c 100644 --- a/vortex-array/src/builders/dict/bytes.rs +++ b/vortex-array/src/builders/dict/bytes.rs @@ -1,15 +1,22 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::cell::OnceCell; use std::hash::BuildHasher; use std::mem; use std::sync::Arc; +use itertools::Itertools; +use num_traits::AsPrimitive; +use vortex_array::ExecutionCtx; use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; use vortex_buffer::ByteBufferMut; use vortex_error::VortexExpect; +use vortex_error::VortexResult; use vortex_error::vortex_panic; +use vortex_mask::AllOr; +use vortex_mask::Mask; use vortex_utils::aliases::hash_map::DefaultHashBuilder; use vortex_utils::aliases::hash_map::HashTable; use vortex_utils::aliases::hash_map::HashTableEntry; @@ -18,23 +25,24 @@ use vortex_utils::aliases::hash_map::RandomState; use super::DictConstraints; use super::DictEncoder; use crate::ArrayRef; +use crate::ArrayView; use crate::IntoArray; -use crate::accessor::ArrayAccessor; use crate::arrays::PrimitiveArray; use crate::arrays::VarBin; use crate::arrays::VarBinView; use crate::arrays::VarBinViewArray; +use crate::arrays::varbin::VarBinArrayExt; use crate::arrays::varbinview::build_views::BinaryView; -#[expect(deprecated)] -use crate::canonical::ToCanonical as _; use crate::dtype::DType; use crate::dtype::PType; use crate::dtype::UnsignedPType; +use crate::match_each_integer_ptype; use crate::validity::Validity; /// Dictionary encode varbin array. Specializes for primitive byte arrays to avoid double copying -pub struct BytesDictBuilder { - lookup: Option>, +pub struct BytesDictBuilder { + lookup: Option>, + null_code: OnceCell, views: BufferMut, values: ByteBufferMut, values_nulls: BitBufferMut, @@ -58,6 +66,7 @@ impl BytesDictBuilder { Self { lookup: Some(HashTable::new()), views: BufferMut::::empty(), + null_code: OnceCell::new(), values: BufferMut::empty(), values_nulls: BitBufferMut::empty(), hasher: DefaultHashBuilder::default(), @@ -71,18 +80,16 @@ impl BytesDictBuilder { self.views.len() * size_of::() + self.values.len() } - fn lookup_bytes(&self, idx: usize) -> Option<&[u8]> { - self.values_nulls.value(idx).then(|| { - let bin_view = &self.views[idx]; - if bin_view.is_inlined() { - bin_view.as_inlined().value() - } else { - &self.values[bin_view.as_view().as_range()] - } - }) + fn lookup_bytes(&self, idx: usize) -> &[u8] { + let bin_view = &self.views[idx]; + if bin_view.is_inlined() { + bin_view.as_inlined().value() + } else { + &self.values[bin_view.as_view().as_range()] + } } - fn encode_value(&mut self, lookup: &mut HashTable, val: Option<&[u8]>) -> Option { + fn encode_value(&mut self, lookup: &mut HashTable, val: &[u8]) -> Option { match lookup.entry( self.hasher.hash_one(val), |idx| val == self.lookup_bytes(idx.as_()), @@ -95,35 +102,25 @@ impl BytesDictBuilder { } let next_code = self.views.len(); - match val { - None => { - // Null value - self.views.push(BinaryView::default()); - self.values_nulls.append_false(); - } - Some(val) => { - let view = BinaryView::make_view( - val, - 0, - u32::try_from(self.values.len()) - .vortex_expect("values length must fit in u32"), - ); - let additional_bytes = if view.is_inlined() { - size_of::() - } else { - size_of::() + val.len() - }; + let view = BinaryView::make_view( + val, + 0, + u32::try_from(self.values.len()).vortex_expect("values length must fit in u32"), + ); + let additional_bytes = if view.is_inlined() { + size_of::() + } else { + size_of::() + val.len() + }; - if self.dict_bytes() + additional_bytes > self.max_dict_bytes { - return None; - } + if self.dict_bytes() + additional_bytes > self.max_dict_bytes { + return None; + } - self.views.push(view); - self.values_nulls.append_true(); - if !view.is_inlined() { - self.values.extend_from_slice(val); - } - } + self.views.push(view); + self.values_nulls.append_true(); + if !view.is_inlined() { + self.values.extend_from_slice(val); } let next_code = Code::from_usize(next_code).unwrap_or_else(|| { @@ -134,29 +131,119 @@ impl BytesDictBuilder { } } - fn encode_bytes>(&mut self, accessor: &A, len: usize) -> ArrayRef { + /// Encode a stream of value bytes against the dictionary, honoring the supplied validity mask. + /// + /// `values` must yield one slice per logical row in input order; the mask is applied here so + /// callers do not need to emit anything for null positions. + fn encode_iter<'a, I>( + &mut self, + len: usize, + validity_mask: Mask, + values: I, + ) -> VortexResult + where + I: Iterator, + { let mut local_lookup = self.lookup.take().vortex_expect("Must have a lookup dict"); let mut codes: BufferMut = BufferMut::with_capacity(len); - accessor.with_iterator(|it| { - for value in it { - let Some(code) = self.encode_value(&mut local_lookup, value) else { - break; - }; - // SAFETY: we reserved capacity in the buffer for `len` elements - unsafe { codes.push_unchecked(code) } + match validity_mask.bit_buffer() { + AllOr::All => { + for value in values { + let Some(code) = self.encode_value(&mut local_lookup, value) else { + break; + }; + // SAFETY: we reserved capacity in the buffer for `len` elements + unsafe { codes.push_unchecked(code) } + } } - }); + AllOr::None => { + self.views.push(BinaryView::default()); + self.values_nulls.append_false(); + unsafe { + codes.push_n_unchecked(Code::from_usize(0).vortex_expect("must fit 0"), len) + } + } + AllOr::Some(b) => { + for (value, valid) in values.zip_eq(b.iter()) { + if !valid { + let code = self.null_code.get_or_init(|| { + let code = self.views.len(); + self.views.push(BinaryView::default()); + self.values_nulls.append_false(); + Code::from_usize(code).unwrap_or_else(|| { + vortex_panic!("{} has to fit into {}", code, Code::PTYPE) + }) + }); + // SAFETY: we reserved capacity in the buffer for `len` elements + unsafe { codes.push_unchecked(*code) } + } else { + let Some(code) = self.encode_value(&mut local_lookup, value) else { + break; + }; + // SAFETY: we reserved capacity in the buffer for `len` elements + unsafe { codes.push_unchecked(code) } + } + } + } + } // Restore lookup dictionary back into the struct self.lookup = Some(local_lookup); - PrimitiveArray::new(codes, Validity::NonNullable).into_array() + Ok(PrimitiveArray::new(codes, Validity::NonNullable)) + } + + fn encode_varbin( + &mut self, + var_bin: ArrayView, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let offsets = var_bin.offsets().clone().execute::(ctx)?; + let bytes = var_bin.bytes(); + let validity_mask = var_bin.validity()?.execute_mask(var_bin.len(), ctx)?; + let len = var_bin.len(); + + match_each_integer_ptype!(offsets.ptype(), |P| { + let slice_offsets = offsets.as_slice::

(); + let values = slice_offsets.windows(2).map(|w| { + let start: usize = w[0].as_(); + let end: usize = w[1].as_(); + &bytes[start..end] + }); + self.encode_iter(len, validity_mask, values) + }) + } + + fn encode_varbinview( + &mut self, + var_bin_view: ArrayView, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let validity_mask = var_bin_view + .validity()? + .execute_mask(var_bin_view.len(), ctx)?; + let len = var_bin_view.len(); + let views = var_bin_view.views(); + let buffers = var_bin_view + .data_buffers() + .iter() + .map(|b| b.as_host()) + .collect::>(); + + let values = views.iter().map(|view| { + if view.is_inlined() { + view.as_inlined().value() + } else { + &buffers[view.as_view().buffer_index as usize][view.as_view().as_range()] + } + }); + self.encode_iter(len, validity_mask, values) } } impl DictEncoder for BytesDictBuilder { - fn encode(&mut self, array: &ArrayRef) -> ArrayRef { + fn encode(&mut self, array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { debug_assert_eq!( &self.dtype, array.dtype(), @@ -165,17 +252,15 @@ impl DictEncoder for BytesDictBuilder { self.dtype ); - let len = array.len(); if let Some(varbinview) = array.as_opt::() { - self.encode_bytes(&varbinview.into_owned(), len) + self.encode_varbinview(varbinview, ctx) } else if let Some(varbin) = array.as_opt::() { - self.encode_bytes(&varbin.into_owned(), len) + self.encode_varbin(varbin, ctx) } else { // NOTE(aduffy): it is very rare that this path would be taken, only e.g. // if we're performing dictionary encoding downstream of some other compression. - #[expect(deprecated)] - let varbinview = array.to_varbinview(); - self.encode_bytes(&varbinview, len) + let vbv_array = array.clone().execute::(ctx)?; + self.encode_varbinview(vbv_array.as_view(), ctx) } } @@ -210,11 +295,11 @@ mod test { use vortex_session::VortexSession; use crate::IntoArray; - #[expect(deprecated)] - use crate::ToCanonical as _; use crate::VortexSessionExecute; use crate::accessor::ArrayAccessor; + use crate::arrays::PrimitiveArray; use crate::arrays::VarBinArray; + use crate::arrays::VarBinViewArray; use crate::arrays::dict::DictArraySlotsExt; use crate::builders::dict::dict_encode; @@ -222,13 +307,19 @@ mod test { #[test] fn encode_varbin() { - let arr = VarBinArray::from(vec!["hello", "world", "hello", "again", "world"]); + let arr = VarBinViewArray::from_iter_str(vec!["hello", "world", "hello", "again", "world"]); let dict = dict_encode(&arr.into_array(), &mut SESSION.create_execution_ctx()).unwrap(); - #[expect(deprecated)] - let codes = dict.codes().to_primitive(); + let codes = dict + .codes() + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); assert_eq!(codes.as_slice::(), &[0, 1, 0, 2, 1]); - #[expect(deprecated)] - let values = dict.values().to_varbinview(); + let values = dict + .values() + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); values.with_iterator(|iter| { assert_eq!( iter.flatten() @@ -241,7 +332,7 @@ mod test { #[test] fn encode_varbin_nulls() { - let arr: VarBinArray = vec![ + let arr: VarBinViewArray = vec![ Some("hello"), None, Some("world"), @@ -254,11 +345,17 @@ mod test { .into_iter() .collect(); let dict = dict_encode(&arr.into_array(), &mut SESSION.create_execution_ctx()).unwrap(); - #[expect(deprecated)] - let codes = dict.codes().to_primitive(); + let codes = dict + .codes() + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); assert_eq!(codes.as_slice::(), &[0, 1, 2, 0, 1, 3, 2, 1]); - #[expect(deprecated)] - let values = dict.values().to_varbinview(); + let values = dict + .values() + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); values.with_iterator(|iter| { assert_eq!( iter.map(|b| b.map(|v| unsafe { str::from_utf8_unchecked(v) })) @@ -272,8 +369,11 @@ mod test { fn repeated_values() { let arr = VarBinArray::from(vec!["a", "a", "b", "b", "a", "b", "a", "b"]); let dict = dict_encode(&arr.into_array(), &mut SESSION.create_execution_ctx()).unwrap(); - #[expect(deprecated)] - let values = dict.values().to_varbinview(); + let values = dict + .values() + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); values.with_iterator(|iter| { assert_eq!( iter.flatten() @@ -282,8 +382,11 @@ mod test { vec!["a", "b"] ); }); - #[expect(deprecated)] - let codes = dict.codes().to_primitive(); + let codes = dict + .codes() + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); assert_eq!(codes.as_slice::(), &[0, 0, 1, 1, 0, 1, 0, 1]); } } diff --git a/vortex-array/src/builders/dict/mod.rs b/vortex-array/src/builders/dict/mod.rs index c64e436bb45..94834f83cc4 100644 --- a/vortex-array/src/builders/dict/mod.rs +++ b/vortex-array/src/builders/dict/mod.rs @@ -35,7 +35,7 @@ pub const UNCONSTRAINED: DictConstraints = DictConstraints { pub trait DictEncoder: Send { /// Assign dictionary codes to the given input array. - fn encode(&mut self, array: &ArrayRef) -> ArrayRef; + fn encode(&mut self, array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult; /// Clear the encoder state to make it ready for a new round of decoding. fn reset(&mut self) -> ArrayRef; @@ -68,8 +68,7 @@ pub fn dict_encode_with_constraints( ctx: &mut ExecutionCtx, ) -> VortexResult { let mut encoder = dict_encoder(array, constraints); - let encoded = encoder.encode(array); - let codes = encoded.execute::(ctx)?.narrow(ctx)?; + let codes = encoder.encode(array, ctx)?.narrow(ctx)?; // SAFETY: The encoding process will produce a value set of codes and values // All values in the dictionary are guaranteed to be referenced by at least one code // since we build the dictionary from the codes we observe during encoding diff --git a/vortex-array/src/builders/dict/primitive.rs b/vortex-array/src/builders/dict/primitive.rs index 5e6d0be67ba..3d80cd4723f 100644 --- a/vortex-array/src/builders/dict/primitive.rs +++ b/vortex-array/src/builders/dict/primitive.rs @@ -1,23 +1,25 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::cell::OnceCell; use std::hash::Hash; use std::mem; use rustc_hash::FxBuildHasher; use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; use vortex_error::vortex_panic; +use vortex_mask::Mask; use vortex_utils::aliases::hash_map::Entry; use vortex_utils::aliases::hash_map::HashMap; use super::DictConstraints; use super::DictEncoder; use crate::ArrayRef; +use crate::ExecutionCtx; use crate::IntoArray; -#[expect(deprecated)] -use crate::ToCanonical as _; -use crate::accessor::ArrayAccessor; use crate::arrays::PrimitiveArray; use crate::arrays::primitive::NativeValue; use crate::dtype::NativePType; @@ -72,6 +74,7 @@ where .min(constraints.max_bytes / T::PTYPE.byte_width()); Self { lookup: HashMap::with_hasher(FxBuildHasher), + null_code: OnceCell::new(), values: BufferMut::::empty(), values_nulls: BitBufferMut::empty(), nullability, @@ -79,8 +82,8 @@ where } } - fn encode_value(&mut self, v: Option) -> Option { - match self.lookup.entry(v.map(NativeValue)) { + fn encode_value(&mut self, v: T) -> Option { + match self.lookup.entry(NativeValue(v)) { Entry::Occupied(o) => Some(*o.get()), Entry::Vacant(vac) => { if self.values.len() >= self.max_dict_len { @@ -89,18 +92,9 @@ where let next_code = Code::from_usize(self.values.len()).unwrap_or_else(|| { vortex_panic!("{} has to fit into {}", self.values.len(), Code::PTYPE) }); - vac.insert(next_code); - match v { - None => { - self.values.push(T::default()); - self.values_nulls.append_false(); - } - Some(v) => { - self.values.push(v); - self.values_nulls.append_true(); - } - } - Some(next_code) + self.values.push(v); + self.values_nulls.append_true(); + Some(*vac.insert(next_code)) } } } @@ -110,7 +104,8 @@ where /// /// Null values are stored in the values of the dictionary such that codes are always non-null. pub struct PrimitiveDictBuilder { - lookup: HashMap>, Code, FxBuildHasher>, + lookup: HashMap, Code, FxBuildHasher>, + null_code: OnceCell, values: BufferMut, values_nulls: BitBufferMut, nullability: Nullability, @@ -123,21 +118,53 @@ where NativeValue: Hash + Eq, Code: UnsignedPType, { - fn encode(&mut self, array: &ArrayRef) -> ArrayRef { + fn encode(&mut self, array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { let mut codes = BufferMut::::with_capacity(array.len()); - #[expect(deprecated)] - let prim = array.to_primitive(); - prim.with_iterator(|it| { - for value in it { - let Some(code) = self.encode_value(value.copied()) else { - break; - }; - unsafe { codes.push_unchecked(code) } + let prim = array.clone().execute::(ctx)?; + match prim.validity()?.execute_mask(array.len(), ctx)? { + Mask::AllTrue(_) => { + for &value in prim.as_slice::() { + let Some(code) = self.encode_value(value) else { + break; + }; + unsafe { codes.push_unchecked(code) } + } + } + Mask::AllFalse(_) => { + self.values.push(T::default()); + self.values_nulls.append_false(); + unsafe { + codes.push_n_unchecked( + Code::from_usize(0).vortex_expect("must fit 0"), + array.len(), + ) + } } - }); + Mask::Values(v) => { + let bit_buff = v.bit_buffer(); + for (&value, valid) in prim.as_slice::().iter().zip(bit_buff) { + if !valid { + let code = self.null_code.get_or_init(|| { + let code = self.values.len(); + self.values.push(T::default()); + self.values_nulls.append_false(); + Code::from_usize(code).unwrap_or_else(|| { + vortex_panic!("{} has to fit into {}", code, Code::PTYPE) + }) + }); + unsafe { codes.push_unchecked(*code) } + } else { + let Some(code) = self.encode_value(value) else { + break; + }; + unsafe { codes.push_unchecked(code) } + } + } + } + } - PrimitiveArray::new(codes, Validity::NonNullable).into_array() + Ok(PrimitiveArray::new(codes, Validity::NonNullable)) } fn reset(&mut self) -> ArrayRef { @@ -157,8 +184,6 @@ where mod test { use std::sync::LazyLock; - #[expect(unused_imports)] - use itertools::Itertools; use vortex_buffer::buffer; use vortex_session::VortexSession; @@ -196,8 +221,9 @@ mod test { None, Some(3), None, - ]); - let dict = dict_encode(&arr.into_array(), &mut SESSION.create_execution_ctx()).unwrap(); + ]) + .into_array(); + let dict = dict_encode(&arr, &mut SESSION.create_execution_ctx()).unwrap(); let expected_codes = buffer![0u8, 0, 1, 2, 2, 1, 2, 1].into_array(); assert_arrays_eq!(dict.codes(), expected_codes, &mut ctx); diff --git a/vortex-array/src/scalar/arrow.rs b/vortex-array/src/scalar/arrow.rs index ea8a9decd34..3af9f842cad 100644 --- a/vortex-array/src/scalar/arrow.rs +++ b/vortex-array/src/scalar/arrow.rs @@ -557,10 +557,10 @@ mod tests { #[rstest] #[case(TimeUnit::Nanoseconds, "UTC", 1234567890000000000i64)] - #[case(TimeUnit::Microseconds, "EST", 1234567890000000i64)] + #[case(TimeUnit::Microseconds, "America/New_York", 1234567890000000i64)] #[case(TimeUnit::Microseconds, "Asia/Qatar", 1234567890000000i64)] #[case(TimeUnit::Microseconds, "Australia/Sydney", 1234567890000000i64)] - #[case(TimeUnit::Milliseconds, "HST", 1234567890000i64)] + #[case(TimeUnit::Milliseconds, "Pacific/Honolulu", 1234567890000i64)] #[case(TimeUnit::Seconds, "GMT", 1234567890i64)] fn test_temporal_timestamp_tz_to_arrow( #[case] time_unit: TimeUnit, diff --git a/vortex-buffer/src/trusted_len.rs b/vortex-buffer/src/trusted_len.rs index 13cf25d0546..7ef6f682be4 100644 --- a/vortex-buffer/src/trusted_len.rs +++ b/vortex-buffer/src/trusted_len.rs @@ -1,8 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use itertools::ProcessResults; - /// Trait for all types which have a known upper-bound. /// /// Functions that receive a `TrustedLen` iterator can assume that it's `size_hint` is exact, @@ -146,7 +144,7 @@ unsafe impl TrustedLen for crate::Iter<'_, T> {} unsafe impl TrustedLen for crate::BufferIterator {} // ProcessResults -unsafe impl<'a, I, T: 'a, E: 'a> TrustedLen for ProcessResults<'a, I, E> where +unsafe impl<'a, I, T: 'a, E: 'a> TrustedLen for itertools::ProcessResults<'a, I, E> where I: TrustedLen> { } @@ -158,7 +156,7 @@ unsafe impl TrustedLen for std::iter::Enumerate where I: TrustedLen TrustedLen for std::iter::Zip where T: TrustedLen, - U: TrustedLen, + U: Iterator, { } diff --git a/vortex-compressor/Cargo.toml b/vortex-compressor/Cargo.toml index 2d28d86a5e6..317ddeca61b 100644 --- a/vortex-compressor/Cargo.toml +++ b/vortex-compressor/Cargo.toml @@ -26,6 +26,15 @@ vortex-error = { workspace = true } vortex-mask = { workspace = true } vortex-utils = { workspace = true } +# Cloudflare's `cardinality-estimator` uses a tagged-pointer representation gated on a 64-bit +# pointer width, so it cannot compile on 32-bit targets. On those platforms we fall back to the +# pure `hyperloglogplus` crate; the two backends are unified by the wrapper in `stats::cardinality`. +[target.'cfg(target_pointer_width = "64")'.dependencies] +cardinality-estimator = { workspace = true } + +[target.'cfg(not(target_pointer_width = "64"))'.dependencies] +hyperloglogplus = { workspace = true } + [dev-dependencies] divan = { workspace = true } rstest = { workspace = true } diff --git a/vortex-compressor/benches/dict_encode.rs b/vortex-compressor/benches/dict_encode.rs index 5b2b0650ffd..2c4e24108a7 100644 --- a/vortex-compressor/benches/dict_encode.rs +++ b/vortex-compressor/benches/dict_encode.rs @@ -13,8 +13,6 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; use vortex_buffer::BufferMut; -use vortex_compressor::builtins::integer_dictionary_encode; -use vortex_compressor::stats::IntegerStats; use vortex_session::VortexSession; static SESSION: LazyLock = LazyLock::new(vortex_array::array_session); @@ -42,16 +40,6 @@ fn encode_generic(bencher: Bencher) { .bench_refs(|(array, ctx)| dict_encode(array, ctx).unwrap()); } -#[cfg(not(codspeed))] -#[divan::bench] -fn encode_specialized(bencher: Bencher) { - let array = make_array(); - let stats = IntegerStats::generate(&array, &mut SESSION.create_execution_ctx()); - bencher - .with_inputs(|| &stats) - .bench_refs(|stats| integer_dictionary_encode(array.as_view(), stats)); -} - fn main() { divan::main() } diff --git a/vortex-compressor/src/builtins/constant/binary.rs b/vortex-compressor/src/builtins/constant/binary.rs index 4ad24fe57b5..6b1395ed0fd 100644 --- a/vortex-compressor/src/builtins/constant/binary.rs +++ b/vortex-compressor/src/builtins/constant/binary.rs @@ -48,7 +48,7 @@ impl Scheme for BinaryConstantScheme { // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count() as usize, array_len); + debug_assert_eq!(stats.null_count(), array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } diff --git a/vortex-compressor/src/builtins/constant/string.rs b/vortex-compressor/src/builtins/constant/string.rs index f55c1661660..da6b8419327 100644 --- a/vortex-compressor/src/builtins/constant/string.rs +++ b/vortex-compressor/src/builtins/constant/string.rs @@ -48,12 +48,13 @@ impl Scheme for StringConstantScheme { // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count() as usize, array_len); + debug_assert_eq!(stats.null_count(), array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } - // Since the estimated distinct count is always going to be less than or equal to the actual - // distinct count, if this is not equal to 1 the actual is definitely not equal to 1. + // The estimator is exact for small cardinalities, so a constant array (exactly one + // distinct value) is always reported as 1. An estimate above 1 therefore means the array + // is genuinely not constant; an estimate of 1 still falls through to the exact check below. if stats.estimated_distinct_count().is_some_and(|c| c > 1) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/dict/float.rs b/vortex-compressor/src/builtins/dict/float.rs index 5288637a609..1af27466e67 100644 --- a/vortex-compressor/src/builtins/dict/float.rs +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -7,19 +7,15 @@ //! external compatibility. use vortex_array::ArrayRef; -use vortex_array::ArrayView; use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::DictArray; -use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; -use vortex_array::dtype::half::f16; -use vortex_array::validity::Validity; -use vortex_buffer::Buffer; +use vortex_array::builders::dict::dict_encode; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -34,8 +30,6 @@ use crate::scheme::DescendantExclusion; use crate::scheme::Scheme; use crate::scheme::SchemeExt; use crate::stats::ArrayAndStats; -use crate::stats::FloatErasedStats; -use crate::stats::FloatStats; use crate::stats::GenerateStatsOptions; /// Dictionary encoding for low-cardinality float values. @@ -115,8 +109,7 @@ impl Scheme for FloatDictScheme { compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> VortexResult { - let stats = data.float_stats(exec_ctx); - let dict = dictionary_encode(data.array_as_primitive(), &stats)?; + let dict = dict_encode(data.array(), exec_ctx)?; let has_all_values_referenced = dict.has_all_values_referenced(); @@ -145,106 +138,6 @@ impl Scheme for FloatDictScheme { } } -/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values. -macro_rules! typed_encode { - ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{ - let distinct = $typed.distinct().vortex_expect( - "this must be present since `DictScheme` declared that we need distinct values", - ); - - let values_validity = match $source_array.validity()? { - Validity::NonNullable => Validity::NonNullable, - _ => Validity::AllValid, - }; - let codes_validity = $source_array.validity()?; - - let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect(); - - let max_code = values.len(); - let codes = if max_code <= u8::MAX as usize { - let buf = >::encode( - &values, - $source_array.as_slice::<$typ>(), - ); - PrimitiveArray::new(buf, codes_validity).into_array() - } else if max_code <= u16::MAX as usize { - let buf = >::encode( - &values, - $source_array.as_slice::<$typ>(), - ); - PrimitiveArray::new(buf, codes_validity).into_array() - } else { - let buf = >::encode( - &values, - $source_array.as_slice::<$typ>(), - ); - PrimitiveArray::new(buf, codes_validity).into_array() - }; - - let values = PrimitiveArray::new(values, values_validity).into_array(); - // SAFETY: enforced by the DictEncoder. - Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) }) - }}; -} - -/// Compresses a floating-point array into a dictionary array according to attached stats. -/// -/// # Errors -/// -/// Returns an error if unable to compute validity. -pub fn dictionary_encode( - array: ArrayView<'_, Primitive>, - stats: &FloatStats, -) -> VortexResult { - match stats.erased() { - FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16), - FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32), - FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64), - } -} - -/// Stateless encoder that maps values to dictionary codes via a `HashMap`. -struct DictEncoder; - -/// Trait for encoding values of type `T` into codes of type `I`. -trait Encode { - /// Using the distinct value set, turn the values into a set of codes. - fn encode(distinct: &[T], values: &[T]) -> Buffer; -} - -/// Implements [`Encode`] for a float type using its bit representation as the hash key. -macro_rules! impl_encode { - ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); }; - ($typ:ty, $utyp:ty, $($ityp:ty),+) => { - $( - impl Encode<$typ, $ityp> for DictEncoder { - #[expect(clippy::cast_possible_truncation)] - fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { - let mut codes = - vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity( - distinct.len(), - ); - for (code, &value) in distinct.iter().enumerate() { - codes.insert(value.to_bits(), code as $ityp); - } - - let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); - for value in values { - // Any code lookups which fail are for nulls, so their value does not matter. - output.push(codes.get(&value.to_bits()).copied().unwrap_or_default()); - } - - output.freeze() - } - } - )* - }; -} - -impl_encode!(f16, u16); -impl_encode!(f32, u32); -impl_encode!(f64, u64); - #[cfg(test)] mod tests { use vortex_array::IntoArray; @@ -253,35 +146,25 @@ mod tests { use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::assert_arrays_eq; + use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; use vortex_buffer::buffer; use vortex_error::VortexResult; - use super::dictionary_encode; - use crate::stats::FloatStats; - use crate::stats::GenerateStatsOptions; - #[test] fn test_float_dict_encode() -> VortexResult<()> { let mut ctx = vortex_array::array_session().create_execution_ctx(); let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32]; let validity = Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); - let array = PrimitiveArray::new(values, validity); + let array = PrimitiveArray::new(values, validity).into_array(); - let stats = FloatStats::generate_opts( - &array, - GenerateStatsOptions { - count_distinct_values: true, - }, - &mut ctx, - ); - let dict_array = dictionary_encode(array.as_view(), &stats)?; - assert_eq!(dict_array.values().len(), 2); + let dict_array = dict_encode(&array, &mut ctx)?; + assert_eq!(dict_array.values().len(), 3); assert_eq!(dict_array.codes().len(), 5); let expected = PrimitiveArray::new( - buffer![1f32, 2f32, 2f32, 1f32, 1f32], + buffer![1f32, 2f32, 2f32, 0f32, 1f32], Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), ) .into_array(); diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs index 8ec3bf53345..10387756be8 100644 --- a/vortex-compressor/src/builtins/dict/integer.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -7,18 +7,15 @@ //! for external compatibility. use vortex_array::ArrayRef; -use vortex_array::ArrayView; use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::DictArray; -use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; -use vortex_array::validity::Validity; -use vortex_buffer::Buffer; +use vortex_array::builders::dict::dict_encode; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -30,8 +27,6 @@ use crate::scheme::Scheme; use crate::scheme::SchemeExt; use crate::stats::ArrayAndStats; use crate::stats::GenerateStatsOptions; -use crate::stats::IntegerErasedStats; -use crate::stats::IntegerStats; /// Dictionary encoding for low-cardinality integer values. #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -109,8 +104,7 @@ impl Scheme for IntDictScheme { compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> VortexResult { - let stats = data.integer_stats(exec_ctx); - let dict = dictionary_encode(data.array_as_primitive(), &stats)?; + let dict = dict_encode(data.array(), exec_ctx)?; // Values = child 0. let compressed_values = @@ -137,121 +131,6 @@ impl Scheme for IntDictScheme { } } -/// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values. -macro_rules! typed_encode { - ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{ - let distinct = $typed.distinct().vortex_expect( - "this must be present since `DictScheme` declared that we need distinct values", - ); - - let values_validity = match $source_array.validity()? { - Validity::NonNullable => Validity::NonNullable, - _ => Validity::AllValid, - }; - let codes_validity = $source_array.validity()?; - - let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect(); - - let max_code = values.len(); - let codes = if max_code <= u8::MAX as usize { - let buf = >::encode( - &values, - $source_array.as_slice::<$typ>(), - ); - PrimitiveArray::new(buf, codes_validity).into_array() - } else if max_code <= u16::MAX as usize { - let buf = >::encode( - &values, - $source_array.as_slice::<$typ>(), - ); - PrimitiveArray::new(buf, codes_validity).into_array() - } else { - let buf = >::encode( - &values, - $source_array.as_slice::<$typ>(), - ); - PrimitiveArray::new(buf, codes_validity).into_array() - }; - - let values = PrimitiveArray::new(values, values_validity).into_array(); - // SAFETY: invariants enforced in DictEncoder. - Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) }) - }}; -} - -/// Compresses an integer array into a dictionary array according to attached stats. -/// -/// # Errors -/// -/// Returns an error if unable to compute validity. -#[expect( - clippy::cognitive_complexity, - reason = "complexity from match on all integer types" -)] -pub fn dictionary_encode( - array: ArrayView<'_, Primitive>, - stats: &IntegerStats, -) -> VortexResult { - match stats.erased() { - IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8), - IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16), - IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32), - IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64), - IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8), - IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16), - IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32), - IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64), - } -} - -/// Stateless encoder that maps values to dictionary codes via a `HashMap`. -struct DictEncoder; - -/// Trait for encoding values of type `T` into codes of type `I`. -trait Encode { - /// Using the distinct value set, turn the values into a set of codes. - fn encode(distinct: &[T], values: &[T]) -> Buffer; -} - -/// Implements [`Encode`] for an integer type with all code width variants (u8, u16, u32). -macro_rules! impl_encode { - ($typ:ty) => { impl_encode!($typ, u8, u16, u32); }; - ($typ:ty, $($ityp:ty),+) => { - $( - impl Encode<$typ, $ityp> for DictEncoder { - #[expect(clippy::cast_possible_truncation)] - fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { - let mut codes = - vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity( - distinct.len(), - ); - for (code, &value) in distinct.iter().enumerate() { - codes.insert(value, code as $ityp); - } - - let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); - for value in values { - // Any code lookups which fail are for nulls, so their value does not matter. - // SAFETY: we have exactly sized output to be as large as values. - unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) }; - } - - output.freeze() - } - } - )* - }; -} - -impl_encode!(u8); -impl_encode!(u16); -impl_encode!(u32); -impl_encode!(u64); -impl_encode!(i8); -impl_encode!(i16); -impl_encode!(i32); -impl_encode!(i64); - #[cfg(test)] mod tests { use vortex_array::IntoArray; @@ -260,34 +139,25 @@ mod tests { use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::assert_arrays_eq; + use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; use vortex_buffer::buffer; use vortex_error::VortexResult; - use super::dictionary_encode; - use crate::stats::IntegerStats; - #[test] fn test_dict_encode_integer_stats() -> VortexResult<()> { let mut ctx = vortex_array::array_session().create_execution_ctx(); let data = buffer![100i32, 200, 100, 0, 100]; let validity = Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); - let array = PrimitiveArray::new(data, validity); + let array = PrimitiveArray::new(data, validity).into_array(); - let stats = IntegerStats::generate_opts( - &array, - crate::stats::GenerateStatsOptions { - count_distinct_values: true, - }, - &mut ctx, - ); - let dict_array = dictionary_encode(array.as_view(), &stats)?; - assert_eq!(dict_array.values().len(), 2); + let dict_array = dict_encode(&array, &mut ctx)?; + assert_eq!(dict_array.values().len(), 3); assert_eq!(dict_array.codes().len(), 5); let expected = PrimitiveArray::new( - buffer![100i32, 200, 100, 100, 100], + buffer![100i32, 200, 100, 0, 100], Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), ) .into_array(); diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs index 4862df2b211..dd5fa0227ed 100644 --- a/vortex-compressor/src/builtins/dict/mod.rs +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -10,7 +10,5 @@ mod string; pub use binary::BinaryDictScheme; pub use float::FloatDictScheme; -pub use float::dictionary_encode as float_dictionary_encode; pub use integer::IntDictScheme; -pub use integer::dictionary_encode as integer_dictionary_encode; pub use string::StringDictScheme; diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs index d014f1a6cf3..499c2c6d6a7 100644 --- a/vortex-compressor/src/builtins/mod.rs +++ b/vortex-compressor/src/builtins/mod.rs @@ -16,8 +16,6 @@ pub use dict::BinaryDictScheme; pub use dict::FloatDictScheme; pub use dict::IntDictScheme; pub use dict::StringDictScheme; -pub use dict::float_dictionary_encode; -pub use dict::integer_dictionary_encode; mod constant; diff --git a/vortex-compressor/src/stats/bool.rs b/vortex-compressor/src/stats/bool.rs index fbbc4001b90..0a9931e240a 100644 --- a/vortex-compressor/src/stats/bool.rs +++ b/vortex-compressor/src/stats/bool.rs @@ -92,18 +92,25 @@ impl BoolStats { #[cfg(test)] mod tests { + use std::sync::LazyLock; + use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::BoolArray; + use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_error::VortexResult; + use vortex_session::VortexSession; use super::BoolStats; + static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + #[test] fn test_all_true() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, true, true]), Validity::NonNullable, @@ -118,7 +125,7 @@ mod tests { #[test] fn test_all_false() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![false, false, false]), Validity::NonNullable, @@ -133,7 +140,7 @@ mod tests { #[test] fn test_mixed() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, false, true]), Validity::NonNullable, @@ -148,7 +155,7 @@ mod tests { #[test] fn test_with_nulls() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, false, true]), Validity::from_iter([true, false, true]), diff --git a/vortex-compressor/src/stats/cardinality.rs b/vortex-compressor/src/stats/cardinality.rs new file mode 100644 index 00000000000..a17dc40d018 --- /dev/null +++ b/vortex-compressor/src/stats/cardinality.rs @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Platform-portable cardinality estimator. +//! +//! On 64-bit targets this delegates to Cloudflare's +//! [`cardinality_estimator`](https://crates.io/crates/cardinality-estimator), which is exact up +//! to ~128 distinct values and then transitions to a HyperLogLog++ representation. That crate is +//! gated by `#[cfg(target_pointer_width = "64")]` because it stores its state in a tagged +//! `usize`, so it cannot compile on 32-bit platforms. To keep the compressor portable we fall +//! back to the pure-Rust [`hyperloglogplus`](https://crates.io/crates/hyperloglogplus) crate on +//! non-64-bit targets, using a HyperLogLog++ with sparse representation that gives comparable +//! quality within the standard HLL++ error bound (~1.6% at the default precision). + +use std::hash::Hash; + +/// Approximate distinct-count estimator with a tiny, stable surface area. +/// +/// The estimator is exact for small cardinalities on 64-bit targets and approximate beyond. On +/// non-64-bit targets it is a HyperLogLog++ sketch throughout; collisions in the sparse +/// representation make small-cardinality estimates approximate as well, though the error stays +/// well within the standard HLL++ bound. +pub(crate) struct CardinalityEstimator { + /// Platform-selected backend (Cloudflare on 64-bit, HLL++ elsewhere). + inner: inner::Estimator, +} + +impl CardinalityEstimator { + /// Create a new estimator with the default precision. + #[inline] + pub(crate) fn new() -> Self { + Self { + inner: inner::Estimator::new(), + } + } + + /// Insert a hashable item into the estimator. + #[inline] + pub(crate) fn insert(&mut self, item: &T) { + self.inner.insert(item); + } + + /// Return the current cardinality estimate. + /// + /// Takes `&mut self` because the 32-bit fallback's `count` implementation mutates internal + /// caches; the 64-bit implementation is logically `&self` but is wrapped uniformly. + #[inline] + pub(crate) fn estimate(&mut self) -> usize { + self.inner.estimate() + } +} + +impl Default for CardinalityEstimator { + fn default() -> Self { + Self::new() + } +} + +/// Backend implementations selected at compile time by pointer width. +#[cfg(target_pointer_width = "64")] +mod inner { + use std::hash::Hash; + + /// Thin wrapper around Cloudflare's tagged-pointer cardinality estimator. + pub(super) struct Estimator { + /// The Cloudflare estimator using its default `P=12, W=6` parameters. + inner: cardinality_estimator::CardinalityEstimator, + } + + impl Estimator { + /// Construct a fresh estimator at default precision. + #[inline] + pub(super) fn new() -> Self { + Self { + inner: cardinality_estimator::CardinalityEstimator::new(), + } + } + + /// Forward an insertion to the underlying estimator. + #[inline] + pub(super) fn insert(&mut self, item: &T) { + self.inner.insert(item); + } + + /// Forward to the underlying constant-time estimate. + #[inline] + pub(super) fn estimate(&mut self) -> usize { + self.inner.estimate() + } + } +} + +/// 32-bit fallback using the `hyperloglogplus` crate. +#[cfg(not(target_pointer_width = "64"))] +mod inner { + use std::hash::BuildHasherDefault; + use std::hash::Hash; + + use hyperloglogplus::HyperLogLog as _; + use hyperloglogplus::HyperLogLogPlus; + use rustc_hash::FxHasher; + use vortex_error::VortexExpect; + + /// HLL++ precision exponent: 2^12 = 4096 registers, matching Cloudflare's default (~1.6% error). + const PRECISION: u8 = 12; + + /// HyperLogLog++ sketch over `T`, hashed with a deterministic `FxHasher`. + pub(super) struct Estimator { + /// Backing HLL++ sketch. + inner: HyperLogLogPlus>, + } + + impl Estimator { + /// Construct a fresh estimator at the configured `PRECISION`. + #[inline] + pub(super) fn new() -> Self { + // `HyperLogLogPlus::new` only fails if precision is outside `[4, 18]`; 12 is a + // compile-time constant inside that range. + let inner = HyperLogLogPlus::new(PRECISION, BuildHasherDefault::default()) + .ok() + .vortex_expect("HyperLogLogPlus precision constant is in [4, 18]"); + Self { inner } + } + + /// Hash and absorb the item into the sketch. + #[inline] + pub(super) fn insert(&mut self, item: &T) { + self.inner.insert(item); + } + + /// Compute the current cardinality estimate. + #[inline] + pub(super) fn estimate(&mut self) -> usize { + let count = self.inner.count(); + // `count` returns a non-negative `f64`; round to the nearest integer and clamp to + // `usize` to match the 64-bit backend's return type. + #[expect( + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + reason = "HLL++ count is non-negative; truncation matches the 64-bit backend's usize semantics" + )] + let rounded = count.max(0.0).round() as usize; + rounded + } + } +} diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs index 9d94038af18..26f3c22d503 100644 --- a/vortex-compressor/src/stats/float.rs +++ b/vortex-compressor/src/stats/float.rs @@ -4,10 +4,10 @@ //! Float compression statistics. use std::hash::Hash; +use std::marker::PhantomData; use itertools::Itertools; use num_traits::Float; -use rustc_hash::FxBuildHasher; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::NativeValue; @@ -19,24 +19,20 @@ use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_error::vortex_panic; use vortex_mask::AllOr; -use vortex_utils::aliases::hash_set::HashSet; use super::GenerateStatsOptions; +use super::cardinality::CardinalityEstimator; /// Information about the distinct values in a float array. +/// +/// The distinct count is an estimate produced by Cloudflare's cardinality estimator, which is +/// exact for small cardinalities and approximate beyond that. #[derive(Debug, Clone)] pub struct DistinctInfo { - /// The set of distinct float values. - distinct_values: HashSet, FxBuildHasher>, - /// The count of unique values. This _must_ be non-zero. + /// The estimated count of unique values. This _must_ be non-zero. distinct_count: u32, -} - -impl DistinctInfo { - /// Returns a reference to the distinct values set. - pub fn distinct_values(&self) -> &HashSet, FxBuildHasher> { - &self.distinct_values - } + /// Phantom marker for the float element type. + _marker: PhantomData, } /// Typed statistics for a specific float type. @@ -188,8 +184,8 @@ where average_run_length: 0, erased: TypedStats { distinct: Some(DistinctInfo { - distinct_values: HashSet::with_capacity_and_hasher(0, FxBuildHasher), distinct_count: 0, + _marker: PhantomData, }), } .into(), @@ -202,13 +198,9 @@ where .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; let value_count = array.len() - null_count; - // Keep a HashMap of T, then convert the keys into PValue afterward since value is - // so much more efficient to hash and search for. - let mut distinct_values = if count_distinct_values { - HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher) - } else { - HashSet::with_hasher(FxBuildHasher) - }; + // Cloudflare's cardinality estimator gives us a bounded-memory approximation of the + // number of distinct values, replacing the previous exact `HashSet`. + let mut estimator: CardinalityEstimator> = CardinalityEstimator::new(); let validity = array .as_ref() @@ -227,7 +219,7 @@ where AllOr::All => { for value in first_valid_buff { if count_distinct_values { - distinct_values.insert(NativeValue(value)); + estimator.insert(&NativeValue(value)); } if value != prev { @@ -244,7 +236,7 @@ where { if valid { if count_distinct_values { - distinct_values.insert(NativeValue(value)); + estimator.insert(&NativeValue(value)); } if value != prev { @@ -260,9 +252,10 @@ where let value_count = u32::try_from(value_count)?; let distinct = count_distinct_values.then(|| DistinctInfo { - distinct_count: u32::try_from(distinct_values.len()) - .vortex_expect("more than u32::MAX distinct values"), - distinct_values, + distinct_count: u32::try_from(estimator.estimate()) + .vortex_expect("more than u32::MAX distinct values") + .max(1), + _marker: PhantomData, }); Ok(FloatStats { @@ -275,25 +268,33 @@ where #[cfg(test)] mod tests { + use std::sync::LazyLock; + use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::PrimitiveArray; + use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_buffer::buffer; use vortex_error::VortexResult; + use vortex_session::VortexSession; use super::FloatStats; + use crate::stats::GenerateStatsOptions; + + static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); #[test] fn test_float_stats() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array(); let floats = floats.execute::(&mut ctx)?; let stats = FloatStats::generate_opts( &floats, - crate::stats::GenerateStatsOptions { + GenerateStatsOptions { count_distinct_values: true, }, &mut ctx, @@ -308,7 +309,7 @@ mod tests { #[test] fn test_float_stats_leading_nulls() { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let floats = PrimitiveArray::new( buffer![0.0f32, 1.0f32, 2.0f32], Validity::from_iter([false, true, true]), @@ -316,7 +317,7 @@ mod tests { let stats = FloatStats::generate_opts( &floats, - crate::stats::GenerateStatsOptions { + GenerateStatsOptions { count_distinct_values: true, }, &mut ctx, diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index a399c1060f9..204bb5ad97f 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -6,7 +6,6 @@ use std::hash::Hash; use num_traits::PrimInt; -use rustc_hash::FxBuildHasher; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::NativeValue; @@ -20,30 +19,28 @@ use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_mask::AllOr; -use vortex_utils::aliases::hash_map::HashMap; use super::GenerateStatsOptions; +use super::cardinality::CardinalityEstimator; /// Information about the distinct values in an integer array. +/// +/// The `distinct_count` is an estimate computed using Cloudflare's cardinality estimator, which +/// yields exact counts for small cardinalities (<= 128 for the default parameters) and a +/// HyperLogLog++ approximation for larger cardinalities. The most frequent value is tracked using +/// the Boyer-Moore majority candidate algorithm, so `most_frequent_value` and `top_frequency` are +/// only guaranteed to reflect the true majority element when some value accounts for more than +/// half of the non-null entries; otherwise they are treated as a best-effort estimate. #[derive(Debug, Clone)] pub struct DistinctInfo { - /// The unique values and their occurrences. - distinct_values: HashMap, u32, FxBuildHasher>, - /// The count of unique values. This _must_ be non-zero. + /// The estimated count of unique values. This _must_ be non-zero. distinct_count: u32, - /// The most frequent value. + /// The most frequent value (Boyer-Moore majority candidate). most_frequent_value: T, - /// The number of times the most frequent value occurs. + /// The exact number of times `most_frequent_value` occurs in the array. top_frequency: u32, } -impl DistinctInfo { - /// Returns a reference to the distinct values map. - pub fn distinct_values(&self) -> &HashMap, u32, FxBuildHasher> { - &self.distinct_values - } -} - /// Typed statistics for a specific integer type. #[derive(Debug, Clone)] pub struct TypedStats { @@ -346,7 +343,6 @@ where min: T::max_value(), max: T::min_value(), distinct: Some(DistinctInfo { - distinct_values: HashMap::with_capacity_and_hasher(0, FxBuildHasher), distinct_count: 0, most_frequent_value: T::zero(), top_frequency: 0, @@ -370,12 +366,10 @@ where let buffer = array.to_buffer::(); let head = buffer[head_idx]; - let mut loop_state = LoopState { - distinct_values: if count_distinct_values { - HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher) - } else { - HashMap::with_hasher(FxBuildHasher) - }, + let mut loop_state = LoopState:: { + estimator: CardinalityEstimator::new(), + bm_candidate: head, + bm_votes: 0, prev: head, runs: 1, }; @@ -450,18 +444,25 @@ where .vortex_expect("max should be computed"); let distinct = count_distinct_values.then(|| { - let (&top_value, &top_count) = loop_state - .distinct_values - .iter() - .max_by_key(|&(_, &count)| count) - .vortex_expect("we know this is non-empty"); + // The cardinality estimator is exact for small cardinalities and approximate beyond. + // We clamp to at least 1 because we are inside the non-empty/non-all-null branch. + let distinct_count = u32::try_from(loop_state.estimator.estimate()) + .vortex_expect("there are more than `u32::MAX` distinct values") + .max(1); + + // Count the Boyer-Moore majority candidate exactly via a second pass. If any value + // accounts for more than half of the non-null entries, this counts that value; otherwise + // the returned count is a best-effort estimate for whichever candidate survived. + let top_frequency = count_occurrences::( + buffer.as_slice(), + validity.bit_buffer(), + loop_state.bm_candidate, + ); DistinctInfo { - distinct_count: u32::try_from(loop_state.distinct_values.len()) - .vortex_expect("there are more than `u32::MAX` distinct values"), - most_frequent_value: top_value.0, - top_frequency: top_count, - distinct_values: loop_state.distinct_values, + distinct_count, + most_frequent_value: loop_state.bm_candidate, + top_frequency, } }); @@ -479,13 +480,54 @@ where } /// Internal loop state for integer stats computation. -struct LoopState { +struct LoopState +where + T: IntegerPType, + NativeValue: Eq + Hash, +{ /// The previous value seen. prev: T, /// The run count. runs: u32, - /// The distinct values map. - distinct_values: HashMap, u32, FxBuildHasher>, + /// Cloudflare's cardinality estimator, used to approximate the number of distinct values + /// without materializing an exact hash map. + estimator: CardinalityEstimator>, + /// Boyer-Moore majority candidate; holds the current candidate for the most frequent value. + bm_candidate: T, + /// Boyer-Moore vote counter for `bm_candidate`. + bm_votes: u32, +} + +/// Updates the Boyer-Moore majority-vote state for a single value. +#[inline(always)] +fn boyer_moore_observe(state: &mut LoopState, value: T) +where + T: IntegerPType, + NativeValue: Eq + Hash, +{ + if state.bm_votes == 0 { + state.bm_candidate = value; + state.bm_votes = 1; + } else if value == state.bm_candidate { + state.bm_votes += 1; + } else { + state.bm_votes -= 1; + } +} + +/// Counts exact occurrences of `needle` in `buffer`, restricted to valid positions according to +/// `validity`. +fn count_occurrences(buffer: &[T], validity: AllOr<&BitBuffer>, needle: T) -> u32 { + let count = match validity { + AllOr::All => buffer.iter().filter(|&&v| v == needle).count(), + AllOr::None => 0, + AllOr::Some(mask) => buffer + .iter() + .enumerate() + .filter(|&(idx, &v)| mask.value(idx) && v == needle) + .count(), + }; + u32::try_from(count).vortex_expect("occurrences cannot exceed `u32::MAX`") } /// Inner loop for non-null chunks of 64 values. @@ -499,7 +541,8 @@ fn inner_loop_nonnull( { for &value in values { if count_distinct_values { - *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; + state.estimator.insert(&NativeValue(value)); + boyer_moore_observe(state, value); } if value != state.prev { @@ -522,7 +565,8 @@ fn inner_loop_nullable( for (idx, &value) in values.iter().enumerate() { if is_valid.value(idx) { if count_distinct_values { - *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; + state.estimator.insert(&NativeValue(value)); + boyer_moore_observe(state, value); } if value != state.prev { @@ -546,7 +590,8 @@ fn inner_loop_naive( for (idx, &value) in values.iter().enumerate() { if is_valid.value(idx) { if count_distinct_values { - *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; + state.estimator.insert(&NativeValue(value)); + boyer_moore_observe(state, value); } if value != state.prev { @@ -560,22 +605,27 @@ fn inner_loop_naive( #[cfg(test)] mod tests { use std::iter; + use std::sync::LazyLock; use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::PrimitiveArray; + use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_buffer::Buffer; use vortex_buffer::buffer; use vortex_error::VortexResult; + use vortex_session::VortexSession; use super::IntegerStats; use super::typed_int_stats; + static SESSION: LazyLock = LazyLock::new(array_session); + #[test] fn test_naive_count_distinct_values() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable); let stats = typed_int_stats::(&array, true, &mut ctx)?; assert_eq!(stats.distinct_count().unwrap(), 2); @@ -584,7 +634,7 @@ mod tests { #[test] fn test_naive_count_distinct_values_nullable() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = PrimitiveArray::new( buffer![217u8, 0], Validity::from(BitBuffer::from(vec![true, false])), @@ -596,7 +646,7 @@ mod tests { #[test] fn test_count_distinct_values() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = PrimitiveArray::new((0..128u8).collect::>(), Validity::NonNullable); let stats = typed_int_stats::(&array, true, &mut ctx)?; assert_eq!(stats.distinct_count().unwrap(), 128); @@ -605,7 +655,7 @@ mod tests { #[test] fn test_count_distinct_values_nullable() -> VortexResult<()> { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let array = PrimitiveArray::new( (0..128u8).collect::>(), Validity::from(BitBuffer::from_iter( @@ -619,7 +669,7 @@ mod tests { #[test] fn test_integer_stats_leading_nulls() { - let mut ctx = array_session().create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true])); let stats = IntegerStats::generate_opts( @@ -635,4 +685,40 @@ mod tests { assert_eq!(stats.average_run_length, 1); assert_eq!(stats.distinct_count().unwrap(), 2); } + + #[test] + fn test_most_frequent_value_dominates() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + // A value that appears in 95% of the array must be recovered exactly by the + // Boyer-Moore tracking plus second-pass count. + let top = -1i32; + let mut data: Vec = vec![top; 950]; + data.extend(0..50i32); + let array = PrimitiveArray::new(Buffer::copy_from(&data), Validity::NonNullable); + let stats = typed_int_stats::(&array, true, &mut ctx)?; + let (top_value, top_count) = stats + .erased() + .most_frequent_value_and_count() + .expect("distinct info must be present"); + assert_eq!(top_value, top.into()); + assert_eq!(top_count, 950); + Ok(()) + } + + #[test] + fn test_cardinality_estimate_large_unique() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + // For 1024 distinct values the estimator falls back to HyperLogLog++; verify the + // estimate is within the expected error bound (~1.6% for the default P/W). + let array = + PrimitiveArray::new((0..1024u32).collect::>(), Validity::NonNullable); + let stats = typed_int_stats::(&array, true, &mut ctx)?; + let estimated = stats.distinct_count().unwrap(); + let error_ratio = (estimated as f64 - 1024.0).abs() / 1024.0; + assert!( + error_ratio < 0.05, + "estimator error {error_ratio} exceeds 5% for 1024 distinct values" + ); + Ok(()) + } } diff --git a/vortex-compressor/src/stats/mod.rs b/vortex-compressor/src/stats/mod.rs index 8f6e7720837..1449846215a 100644 --- a/vortex-compressor/src/stats/mod.rs +++ b/vortex-compressor/src/stats/mod.rs @@ -5,6 +5,7 @@ mod bool; mod cache; +mod cardinality; mod float; mod integer; mod options; diff --git a/vortex-compressor/src/stats/varbinview.rs b/vortex-compressor/src/stats/varbinview.rs index 486d431e684..6ea747914c8 100644 --- a/vortex-compressor/src/stats/varbinview.rs +++ b/vortex-compressor/src/stats/varbinview.rs @@ -5,42 +5,82 @@ use vortex_array::ExecutionCtx; use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::varbinview::BinaryView; +use vortex_buffer::ByteBuffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_err; -use vortex_utils::aliases::hash_set::HashSet; +use vortex_mask::AllOr; use super::GenerateStatsOptions; +use super::cardinality::CardinalityEstimator; /// Array of variable-length byte/string values, and relevant stats for compression. #[derive(Clone, Debug)] pub struct StringStats { /// The estimated number of distinct values, or `None` if not computed. /// This _must_ be non-zero. - estimated_distinct_count: Option, + estimated_distinct_count: Option, /// The number of non-null values. - value_count: u32, + value_count: usize, /// The number of null values. - null_count: u32, + null_count: usize, } -/// Estimate the number of distinct values in the var bin view array. -fn estimate_distinct_count(varbinview: &VarBinViewArray) -> VortexResult { - let views = varbinview.views(); - // Iterate the views. Two values which are equal must have the same first 8-bytes. - // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all - // share a 4-byte prefix and have the same length. - let mut distinct = HashSet::with_capacity(views.len() / 2); - views.iter().for_each(|&view| { - #[expect( - clippy::cast_possible_truncation, - reason = "approximate uniqueness with view prefix" - )] - let len_and_prefix = view.as_u128() as u64; - distinct.insert(len_and_prefix); - }); +/// Estimate the number of distinct strings in the var bin view array using Cloudflare's +/// cardinality estimator. +/// +/// Every non-null value is hashed in full, so the estimate's only error is the estimator's own: +/// it is exact for small cardinalities and transitions to HyperLogLog++ for larger ones. Null +/// entries are skipped, matching the integer and float distinct-count stats. +fn estimate_distinct_count( + strings: &VarBinViewArray, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let views = strings.views(); + let validity = strings + .as_ref() + .validity()? + .execute_mask(strings.len(), ctx)?; + let mut estimator: CardinalityEstimator<[u8]> = CardinalityEstimator::new(); + let buffers = strings + .data_buffers() + .iter() + .map(|b| b.as_host()) + .collect::>(); - Ok(u32::try_from(distinct.len())?) + match validity.bit_buffer() { + AllOr::All => { + for view in views { + estimator.insert(view_bytes(&buffers, view)); + } + } + // Every value is null, so there is nothing to count. + AllOr::None => {} + AllOr::Some(is_valid) => { + for (idx, view) in views.iter().enumerate() { + if is_valid.value(idx) { + estimator.insert(view_bytes(&buffers, view)); + } + } + } + } + + Ok(estimator.estimate()) +} + +/// Returns the full bytes backing a single view, reading from the array's data buffers when the +/// value is not inlined. +/// +/// Only call this for non-null positions: a null slot's view may hold arbitrary bytes whose buffer +/// index and offset are not safe to dereference. +fn view_bytes<'a>(buffers: &[&'a ByteBuffer], view: &'a BinaryView) -> &'a [u8] { + if view.is_inlined() { + view.as_inlined().value() + } else { + let r = view.as_view(); + &buffers[r.buffer_index as usize][r.as_range()] + } } impl StringStats { @@ -57,13 +97,13 @@ impl StringStats { let value_count = input.len() - null_count; let estimated_distinct_count = opts .count_distinct_values - .then(|| estimate_distinct_count(input)) + .then(|| estimate_distinct_count(input, ctx)) .transpose()?; Ok(Self { - value_count: u32::try_from(value_count)?, - null_count: u32::try_from(null_count)?, estimated_distinct_count, + value_count, + null_count, }) } } @@ -86,18 +126,19 @@ impl StringStats { /// Returns the estimated number of distinct values, or `None` if not computed. /// - /// This estimation is always going to be less than or equal to the actual distinct count. - pub fn estimated_distinct_count(&self) -> Option { + /// The estimate is exact for small cardinalities and an approximation (which may be slightly + /// above or below the true count) for larger ones. + pub fn estimated_distinct_count(&self) -> Option { self.estimated_distinct_count } /// Returns the number of non-null values. - pub fn value_count(&self) -> u32 { + pub fn value_count(&self) -> usize { self.value_count } /// Returns the number of null values. - pub fn null_count(&self) -> u32 { + pub fn null_count(&self) -> usize { self.null_count } } diff --git a/vortex-ffi/src/data_source.rs b/vortex-ffi/src/data_source.rs index bdbb9b53972..8485be285bd 100644 --- a/vortex-ffi/src/data_source.rs +++ b/vortex-ffi/src/data_source.rs @@ -228,7 +228,7 @@ mod tests { assert_error(error); assert!(ds.is_null()); - opts.paths = c"*.vortex".as_ptr(); + opts.paths = c"definitely-missing-dir/*.vortex".as_ptr(); let ds = vx_data_source_new(session, &raw const opts, &raw mut error); assert_error(error); assert!(ds.is_null()); diff --git a/vortex-layout/src/layouts/dict/writer.rs b/vortex-layout/src/layouts/dict/writer.rs index 7013becd4df..f83c426b55b 100644 --- a/vortex-layout/src/layouts/dict/writer.rs +++ b/vortex-layout/src/layouts/dict/writer.rs @@ -20,6 +20,8 @@ use futures::stream::once; use futures::try_join; use vortex_array::ArrayContext; use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::Dict; use vortex_array::builders::dict::DictConstraints; @@ -557,7 +559,9 @@ fn encode_chunk( mut encoder: Box, chunk: &ArrayRef, ) -> VortexResult { - let encoded = encoder.encode(chunk); + let encoded = encoder + .encode(chunk, &mut LEGACY_SESSION.create_execution_ctx())? + .into_array(); match remainder(chunk, encoded.len())? { None => Ok(EncodingState::Continue((encoder, encoded))), Some(unencoded) => Ok(EncodingState::Done((encoder.reset(), encoded, unencoded))), diff --git a/vortex/benches/single_encoding_throughput.rs b/vortex/benches/single_encoding_throughput.rs index 3c58297e70b..cc81b52d081 100644 --- a/vortex/benches/single_encoding_throughput.rs +++ b/vortex/benches/single_encoding_throughput.rs @@ -208,10 +208,11 @@ fn bench_for_decompress_i32(bencher: Bencher) { #[divan::bench(name = "dict_compress_u32")] fn bench_dict_compress_u32(bencher: Bencher) { let (uint_array, ..) = setup_primitive_arrays(); + let array = uint_array.into_array(); with_byte_counter(bencher, NUM_VALUES * 4) - .with_inputs(|| (&uint_array, SESSION.create_execution_ctx())) - .bench_refs(|(a, ctx)| dict_encode(&a.clone().into_array(), ctx).unwrap()); + .with_inputs(|| (&array, SESSION.create_execution_ctx())) + .bench_refs(|(a, ctx)| dict_encode(a, ctx).unwrap()); } #[divan::bench(name = "dict_decompress_u32")] @@ -383,10 +384,11 @@ fn bench_dict_compress_string(bencher: Bencher) { let varbinview_arr = VarBinViewArray::from_iter_str(gen_varbin_words(NUM_VALUES as usize, 0.00005)); let nbytes = varbinview_arr.nbytes() as u64; + let array = varbinview_arr.into_array(); with_byte_counter(bencher, nbytes) - .with_inputs(|| (&varbinview_arr, SESSION.create_execution_ctx())) - .bench_refs(|(a, ctx)| dict_encode(&a.clone().into_array(), ctx).unwrap()); + .with_inputs(|| (&array, SESSION.create_execution_ctx())) + .bench_refs(|(a, ctx)| dict_encode(a, ctx).unwrap()); } #[divan::bench(name = "dict_decompress_string")] From 141eaa4e5f0e7339a982e775bc6b031cc68c2431 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 3 Jun 2026 00:40:53 +0100 Subject: [PATCH 02/16] fixes Signed-off-by: Robert Kruszewski --- encodings/sequence/src/compress.rs | 52 +++++++++++++------ encodings/sequence/src/lib.rs | 1 + .../src/schemes/integer/sequence.rs | 14 ++--- vortex-compressor/src/stats/integer.rs | 44 ++++++++++++++++ 4 files changed, 85 insertions(+), 26 deletions(-) diff --git a/encodings/sequence/src/compress.rs b/encodings/sequence/src/compress.rs index f34cf4ffe34..11554bfb255 100644 --- a/encodings/sequence/src/compress.rs +++ b/encodings/sequence/src/compress.rs @@ -24,6 +24,8 @@ use vortex_error::VortexResult; use crate::Sequence; use crate::SequenceArray; use crate::SequenceData; +use crate::SequenceDataParts; + /// An iterator that yields `base, base + step, base + 2*step, ...` via repeated addition. struct SequenceIter { acc: T, @@ -91,6 +93,22 @@ pub fn sequence_encode( primitive_array: ArrayView<'_, Primitive>, ctx: &mut ExecutionCtx, ) -> VortexResult> { + let nullability = primitive_array.dtype().nullability(); + let len = primitive_array.len(); + let Some(parts) = sequence_parts(primitive_array, ctx)? else { + return Ok(None); + }; + + Sequence::try_new(parts.base, parts.multiplier, parts.ptype, nullability, len) + .map(|a| Some(a.into_array())) +} + +/// Returns the sequence base and multiplier if a primitive array can be represented exactly as a +/// [`SequenceArray`]. +pub fn sequence_parts( + primitive_array: ArrayView<'_, Primitive>, + ctx: &mut ExecutionCtx, +) -> VortexResult> { if primitive_array.is_empty() { // we cannot encode an empty array return Ok(None); @@ -106,22 +124,22 @@ pub fn sequence_encode( } match_each_integer_ptype!(primitive_array.ptype(), |P| { - encode_primitive_array( - primitive_array.as_slice::

(), - primitive_array.dtype().nullability(), - ) + sequence_parts_typed(primitive_array.as_slice::

()) }) } -fn encode_primitive_array + CheckedAdd + CheckedSub>( +fn sequence_parts_typed + CheckedAdd + CheckedSub>( slice: &[P], - nullability: Nullability, -) -> VortexResult> { +) -> VortexResult> { if slice.len() == 1 { // The multiplier here can be any value, zero is chosen - return Sequence::try_new_typed(slice[0], P::zero(), nullability, 1) - .map(|a| Some(a.into_array())); + return Ok(Some(SequenceDataParts { + base: slice[0].into(), + multiplier: P::zero().into(), + ptype: P::PTYPE, + })); } + let base = slice[0]; let Some(multiplier) = slice[1].checked_sub(&base) else { return Ok(None); @@ -136,14 +154,18 @@ fn encode_primitive_array + CheckedAdd + CheckedSu return Ok(None); } - slice + if !slice .windows(2) .all(|w| Some(w[1]) == w[0].checked_add(&multiplier)) - .then_some( - Sequence::try_new_typed(base, multiplier, nullability, slice.len()) - .map(|a| a.into_array()), - ) - .transpose() + { + return Ok(None); + } + + Ok(Some(SequenceDataParts { + base: base.into(), + multiplier: multiplier.into(), + ptype: P::PTYPE, + })) } #[cfg(test)] diff --git a/encodings/sequence/src/lib.rs b/encodings/sequence/src/lib.rs index b898963c346..a8194326f05 100644 --- a/encodings/sequence/src/lib.rs +++ b/encodings/sequence/src/lib.rs @@ -16,6 +16,7 @@ pub use array::SequenceArray; pub use array::SequenceData; pub use array::SequenceDataParts; pub use compress::sequence_encode; +pub use compress::sequence_parts; use vortex_array::ArrayVTable; use vortex_array::aggregate_fn::AggregateFnVTable; use vortex_array::aggregate_fn::fns::is_sorted::IsSorted; diff --git a/vortex-btrblocks/src/schemes/integer/sequence.rs b/vortex-btrblocks/src/schemes/integer/sequence.rs index 604d7567d00..bc66cbfbafb 100644 --- a/vortex-btrblocks/src/schemes/integer/sequence.rs +++ b/vortex-btrblocks/src/schemes/integer/sequence.rs @@ -20,6 +20,7 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_sequence::sequence_encode; +use vortex_sequence::sequence_parts; use crate::ArrayAndStats; use crate::CascadingCompressor; @@ -82,17 +83,10 @@ impl Scheme for SequenceScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - // If the distinct_values_count was computed, and not all values are unique, then this - // cannot be encoded as a sequence array. - if stats - .distinct_count() - .is_some_and(|count| count as usize != data.array_len()) - { + if !stats.estimated_distinct_count_could_equal(data.array_len()) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - // TODO(connor): `sequence_encode` allocates the encoded array just to confirm feasibility. - // A cheaper `is_sequence` probe would let us skip the allocation entirely. CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( |_compressor, data, best_so_far, _ctx, exec_ctx| { // `SequenceArray` stores exactly two scalars (base and multiplier), so the best @@ -107,9 +101,7 @@ impl Scheme for SequenceScheme { return Ok(EstimateVerdict::Skip); } - // TODO(connor): We should pass this array back to the compressor in the case that - // we do want to sequence encode this so that we do not need to recompress. - if sequence_encode(data.array_as_primitive(), exec_ctx)?.is_none() { + if sequence_parts(data.array_as_primitive(), exec_ctx)?.is_none() { return Ok(EstimateVerdict::Skip); } // TODO(connor): Should we get the actual ratio here? diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index 204bb5ad97f..034eb06c044 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -23,6 +23,13 @@ use vortex_mask::AllOr; use super::GenerateStatsOptions; use super::cardinality::CardinalityEstimator; +/// Expected relative error for the default cardinality estimator precision. +/// +/// Cloudflare's default `P=12` HLL++ parameters document this as `1.04 / sqrt(2^12)`. +const DISTINCT_COUNT_ERROR_NUMERATOR: usize = 65; +/// Denominator for the default cardinality estimator expected relative error. +const DISTINCT_COUNT_ERROR_DENOMINATOR: usize = 4_000; + /// Information about the distinct values in an integer array. /// /// The `distinct_count` is an estimate computed using Cloudflare's cardinality estimator, which @@ -265,12 +272,30 @@ impl IntegerStats { self.erased.distinct_count() } + /// Returns true if the estimated distinct count could equal `count` within the estimator's + /// expected error bound. + pub fn estimated_distinct_count_could_equal(&self, count: usize) -> bool { + let Some(distinct_count) = self.distinct_count() else { + return true; + }; + + let error_bound = distinct_count_error_bound(count); + (distinct_count as usize).abs_diff(count) <= error_bound + } + /// Get the most commonly occurring value and its count, if we have computed it already. pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { self.erased.most_frequent_value_and_count() } } +/// Returns the absolute error bound for an expected distinct count. +fn distinct_count_error_bound(count: usize) -> usize { + count + .saturating_mul(DISTINCT_COUNT_ERROR_NUMERATOR) + .div_ceil(DISTINCT_COUNT_ERROR_DENOMINATOR) +} + impl IntegerStats { /// Generates stats with default options. pub fn generate(input: &PrimitiveArray, ctx: &mut ExecutionCtx) -> Self { @@ -721,4 +746,23 @@ mod tests { ); Ok(()) } + + #[test] + fn test_estimated_distinct_count_could_equal() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + + let unique = + PrimitiveArray::new((0..1024u32).collect::>(), Validity::NonNullable); + let unique_stats = typed_int_stats::(&unique, true, &mut ctx)?; + assert!(unique_stats.estimated_distinct_count_could_equal(1024)); + + let low_cardinality = PrimitiveArray::new( + (0..1024u32).map(|value| value % 8).collect::>(), + Validity::NonNullable, + ); + let low_cardinality_stats = typed_int_stats::(&low_cardinality, true, &mut ctx)?; + assert!(!low_cardinality_stats.estimated_distinct_count_could_equal(1024)); + + Ok(()) + } } From 839c4c68469e0dddc18fb5eb9feb051429d97d44 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 3 Jun 2026 10:18:16 +0100 Subject: [PATCH 03/16] more Signed-off-by: Robert Kruszewski --- .../schemes/integer/scheme_selection_tests.rs | 21 +++++++ vortex-compressor/src/stats/integer.rs | 62 +++++++++++++++---- 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs index e4227a472ec..f19c02340eb 100644 --- a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs +++ b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs @@ -21,6 +21,7 @@ use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_error::VortexResult; use vortex_fastlanes::BitPacked; +use vortex_fastlanes::Delta; use vortex_fastlanes::FoR; use vortex_runend::RunEnd; use vortex_sequence::Sequence; @@ -136,6 +137,26 @@ fn test_sequence_compressed() -> VortexResult<()> { Ok(()) } +/// Regression for the cardinality-estimator change: a 10k-element `arange` is all-distinct, but +/// the HLL++ estimate drifts a few percent off the exact count. The sequence gate must still treat +/// it as a candidate so it is encoded as a sequence (or delta), matching `test_arrange_encode` in +/// the Python suite. +#[test] +fn test_arange_sequence_compressed() -> VortexResult<()> { + let array = PrimitiveArray::new( + (0..10_000u32).collect::>(), + Validity::NonNullable, + ); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array(), &mut SESSION.create_execution_ctx())?; + assert!( + compressed.is::() || compressed.is::(), + "expected Sequence or Delta, got {}", + compressed.encoding_id() + ); + Ok(()) +} + #[test] fn test_rle_compressed() -> VortexResult<()> { let mut values: Vec = Vec::new(); diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index 034eb06c044..abc2bd4c9bc 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -23,12 +23,18 @@ use vortex_mask::AllOr; use super::GenerateStatsOptions; use super::cardinality::CardinalityEstimator; -/// Expected relative error for the default cardinality estimator precision. +/// Expected relative standard error of the default cardinality estimator. /// -/// Cloudflare's default `P=12` HLL++ parameters document this as `1.04 / sqrt(2^12)`. -const DISTINCT_COUNT_ERROR_NUMERATOR: usize = 65; -/// Denominator for the default cardinality estimator expected relative error. -const DISTINCT_COUNT_ERROR_DENOMINATOR: usize = 4_000; +/// Cloudflare's default `P=12` HLL++ parameters document this as `1.04 / sqrt(2^12) ≈ 1.625%`. +const DISTINCT_COUNT_STD_ERROR_NUMERATOR: usize = 65; +/// Denominator for [`DISTINCT_COUNT_STD_ERROR_NUMERATOR`]. +const DISTINCT_COUNT_STD_ERROR_DENOMINATOR: usize = 4_000; +/// Number of standard errors to tolerate when deciding whether an estimate is consistent with an +/// exact count. A single standard error only covers ~68% of estimates, which spuriously rejects +/// genuinely all-distinct arrays (e.g. a true arithmetic sequence whose estimate drifts a few +/// percent off the array length). A wider interval keeps the false-rejection rate negligible while +/// still excluding clearly low-cardinality data, whose estimate is far below the array length. +const DISTINCT_COUNT_CONFIDENCE_SIGMAS: usize = 4; /// Information about the distinct values in an integer array. /// @@ -272,15 +278,26 @@ impl IntegerStats { self.erased.distinct_count() } - /// Returns true if the estimated distinct count could equal `count` within the estimator's - /// expected error bound. + /// Returns true if the true distinct count could plausibly equal `count`. + /// + /// Callers pass an upper bound on the true distinct count (the array length), so an estimate + /// at or above `count` is pure estimator overshoot and cannot rule out "every value is + /// distinct" — we accept it. Below `count` we only rule it out when the estimate falls short by + /// more than the estimator's expected error (see [`distinct_count_error_bound`]), so estimator + /// noise does not cause us to skip genuinely all-distinct arrays such as arithmetic sequences. pub fn estimated_distinct_count_could_equal(&self, count: usize) -> bool { let Some(distinct_count) = self.distinct_count() else { return true; }; + let distinct_count = distinct_count as usize; + + // The true distinct count never exceeds `count`, so an over-estimate reflects estimator + // noise and remains consistent with all values being distinct. + if distinct_count >= count { + return true; + } - let error_bound = distinct_count_error_bound(count); - (distinct_count as usize).abs_diff(count) <= error_bound + count - distinct_count <= distinct_count_error_bound(count) } /// Get the most commonly occurring value and its count, if we have computed it already. @@ -289,11 +306,14 @@ impl IntegerStats { } } -/// Returns the absolute error bound for an expected distinct count. +/// Returns the absolute tolerance for treating an estimate as consistent with `count` distinct +/// values, sized as [`DISTINCT_COUNT_CONFIDENCE_SIGMAS`] multiples of the estimator's standard +/// error. fn distinct_count_error_bound(count: usize) -> usize { count - .saturating_mul(DISTINCT_COUNT_ERROR_NUMERATOR) - .div_ceil(DISTINCT_COUNT_ERROR_DENOMINATOR) + .saturating_mul(DISTINCT_COUNT_STD_ERROR_NUMERATOR) + .saturating_mul(DISTINCT_COUNT_CONFIDENCE_SIGMAS) + .div_ceil(DISTINCT_COUNT_STD_ERROR_DENOMINATOR) } impl IntegerStats { @@ -765,4 +785,22 @@ mod tests { Ok(()) } + + #[test] + fn test_all_distinct_could_equal_despite_estimator_drift() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + + // Regression: an arithmetic sequence is all-distinct, but the HLL++ estimate for ~10k + // values drifts a few percent off the exact count (here it overshoots to ~10457). The + // sequence scheme gates on `estimated_distinct_count_could_equal`, so this must remain true + // or the array is never sequence-encoded. + let arange = PrimitiveArray::new( + (0..10_000u32).collect::>(), + Validity::NonNullable, + ); + let arange_stats = typed_int_stats::(&arange, true, &mut ctx)?; + assert!(arange_stats.estimated_distinct_count_could_equal(10_000)); + + Ok(()) + } } From d72356eccf52aa72aade0f799647a8de885e1626 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 3 Jun 2026 10:21:10 +0100 Subject: [PATCH 04/16] typos Signed-off-by: Robert Kruszewski --- vortex-compressor/src/stats/integer.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index abc2bd4c9bc..9b631fd1763 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -794,12 +794,12 @@ mod tests { // values drifts a few percent off the exact count (here it overshoots to ~10457). The // sequence scheme gates on `estimated_distinct_count_could_equal`, so this must remain true // or the array is never sequence-encoded. - let arange = PrimitiveArray::new( + let range = PrimitiveArray::new( (0..10_000u32).collect::>(), Validity::NonNullable, ); - let arange_stats = typed_int_stats::(&arange, true, &mut ctx)?; - assert!(arange_stats.estimated_distinct_count_could_equal(10_000)); + let range_stats = typed_int_stats::(&range, true, &mut ctx)?; + assert!(range_stats.estimated_distinct_count_could_equal(10_000)); Ok(()) } From f004698d5032e049cc6a7c75523e73190b625106 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 3 Jun 2026 10:21:48 +0100 Subject: [PATCH 05/16] more Signed-off-by: Robert Kruszewski --- .../src/schemes/integer/scheme_selection_tests.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs index f19c02340eb..77d69c62b92 100644 --- a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs +++ b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs @@ -137,12 +137,8 @@ fn test_sequence_compressed() -> VortexResult<()> { Ok(()) } -/// Regression for the cardinality-estimator change: a 10k-element `arange` is all-distinct, but -/// the HLL++ estimate drifts a few percent off the exact count. The sequence gate must still treat -/// it as a candidate so it is encoded as a sequence (or delta), matching `test_arrange_encode` in -/// the Python suite. #[test] -fn test_arange_sequence_compressed() -> VortexResult<()> { +fn test_range_sequence_compressed() -> VortexResult<()> { let array = PrimitiveArray::new( (0..10_000u32).collect::>(), Validity::NonNullable, From dd659060a5055f5088a680ea588309ed7f6e9c43 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 3 Jun 2026 14:00:39 +0100 Subject: [PATCH 06/16] docs Signed-off-by: Robert Kruszewski --- vortex-compressor/src/stats/integer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index 9b631fd1763..e20d29fe737 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -283,8 +283,8 @@ impl IntegerStats { /// Callers pass an upper bound on the true distinct count (the array length), so an estimate /// at or above `count` is pure estimator overshoot and cannot rule out "every value is /// distinct" — we accept it. Below `count` we only rule it out when the estimate falls short by - /// more than the estimator's expected error (see [`distinct_count_error_bound`]), so estimator - /// noise does not cause us to skip genuinely all-distinct arrays such as arithmetic sequences. + /// more than the estimator's expected error, so estimator noise does not cause us to skip + /// genuinely all-distinct arrays such as arithmetic sequences. pub fn estimated_distinct_count_could_equal(&self, count: usize) -> bool { let Some(distinct_count) = self.distinct_count() else { return true; From 6ec21767142fdbb7dd026f608cd473bb98bb6e89 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 3 Jun 2026 14:34:12 +0100 Subject: [PATCH 07/16] more Signed-off-by: Robert Kruszewski --- encodings/sequence/src/compress.rs | 2 - vortex-btrblocks/src/schemes/float/rle.rs | 4 +- vortex-btrblocks/src/schemes/integer/mod.rs | 4 +- vortex-btrblocks/src/schemes/integer/rle.rs | 4 +- .../src/schemes/integer/runend.rs | 6 +-- .../src/schemes/integer/sparse.rs | 2 +- .../src/builtins/constant/float.rs | 10 ++-- .../src/builtins/constant/integer.rs | 10 ++-- .../src/builtins/dict/integer.rs | 8 +-- vortex-compressor/src/stats/float.rs | 27 ++++------ vortex-compressor/src/stats/integer.rs | 53 +++++++++---------- 11 files changed, 59 insertions(+), 71 deletions(-) diff --git a/encodings/sequence/src/compress.rs b/encodings/sequence/src/compress.rs index 11554bfb255..750bd95c38c 100644 --- a/encodings/sequence/src/compress.rs +++ b/encodings/sequence/src/compress.rs @@ -172,8 +172,6 @@ fn sequence_parts_typed + CheckedAdd + CheckedSub> mod tests { use std::sync::LazyLock; - #[expect(unused_imports)] - use itertools::Itertools; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; diff --git a/vortex-btrblocks/src/schemes/float/rle.rs b/vortex-btrblocks/src/schemes/float/rle.rs index 54406fcb9fd..96f1b728e11 100644 --- a/vortex-btrblocks/src/schemes/float/rle.rs +++ b/vortex-btrblocks/src/schemes/float/rle.rs @@ -17,7 +17,7 @@ use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; -use crate::schemes::integer::RUN_LENGTH_THRESHOLD; +use crate::schemes::integer::RUN_THRESHOLD; use crate::schemes::integer::rle_compress; use crate::schemes::rle_ancestor_exclusions; use crate::schemes::rle_descendant_exclusions; @@ -59,7 +59,7 @@ impl Scheme for FloatRLEScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - if data.float_stats(exec_ctx).average_run_length() < RUN_LENGTH_THRESHOLD { + if data.float_stats(exec_ctx).average_run_length() < RUN_THRESHOLD { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-btrblocks/src/schemes/integer/mod.rs b/vortex-btrblocks/src/schemes/integer/mod.rs index abe5868f5c8..e31cd91374d 100644 --- a/vortex-btrblocks/src/schemes/integer/mod.rs +++ b/vortex-btrblocks/src/schemes/integer/mod.rs @@ -35,8 +35,8 @@ pub use vortex_compressor::builtins::IntDictScheme; pub use vortex_compressor::stats::IntegerStats; pub use zigzag::ZigZagScheme; -/// Threshold for the average run length in an array before we consider run-length encoding. -pub(crate) const RUN_LENGTH_THRESHOLD: u32 = 4; +/// Threshold for the average run length in an array before we consider any run encoding. +pub(crate) const RUN_THRESHOLD: usize = 4; #[cfg(test)] mod scheme_selection_tests; diff --git a/vortex-btrblocks/src/schemes/integer/rle.rs b/vortex-btrblocks/src/schemes/integer/rle.rs index df0f0183780..40d55fea9ac 100644 --- a/vortex-btrblocks/src/schemes/integer/rle.rs +++ b/vortex-btrblocks/src/schemes/integer/rle.rs @@ -22,12 +22,12 @@ use vortex_fastlanes::Delta; use vortex_fastlanes::RLE; use vortex_fastlanes::RLEArrayExt; -use super::RUN_LENGTH_THRESHOLD; use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; +use crate::schemes::integer::RUN_THRESHOLD; use crate::schemes::rle_ancestor_exclusions; use crate::schemes::rle_descendant_exclusions; @@ -179,7 +179,7 @@ impl Scheme for IntRLEScheme { if compress_ctx.finished_cascading() { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - if data.integer_stats(exec_ctx).average_run_length() < RUN_LENGTH_THRESHOLD { + if data.integer_stats(exec_ctx).average_run_length() < RUN_THRESHOLD { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-btrblocks/src/schemes/integer/runend.rs b/vortex-btrblocks/src/schemes/integer/runend.rs index b68567d789c..78efb77bc9b 100644 --- a/vortex-btrblocks/src/schemes/integer/runend.rs +++ b/vortex-btrblocks/src/schemes/integer/runend.rs @@ -23,6 +23,7 @@ use vortex_runend::RunEnd; use vortex_runend::compress::runend_encode; use super::IntRLEScheme; +use super::RUN_THRESHOLD; use super::SparseScheme; use crate::ArrayAndStats; use crate::CascadingCompressor; @@ -30,9 +31,6 @@ use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; -/// Threshold for the average run length in an array before we consider run-end encoding. -const RUN_END_THRESHOLD: u32 = 4; - /// Run-end encoding with end positions. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct RunEndScheme; @@ -104,7 +102,7 @@ impl Scheme for RunEndScheme { exec_ctx: &mut ExecutionCtx, ) -> CompressionEstimate { // If the run length is below the threshold, drop it. - if data.integer_stats(exec_ctx).average_run_length() < RUN_END_THRESHOLD { + if data.integer_stats(exec_ctx).average_run_length() < RUN_THRESHOLD { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-btrblocks/src/schemes/integer/sparse.rs b/vortex-btrblocks/src/schemes/integer/sparse.rs index 609a258d495..0ae6defb97a 100644 --- a/vortex-btrblocks/src/schemes/integer/sparse.rs +++ b/vortex-btrblocks/src/schemes/integer/sparse.rs @@ -140,7 +140,7 @@ impl Scheme for SparseScheme { "this must be present since `SparseScheme` declared that we need distinct values", ); - if most_frequent_count as usize == len { + if most_frequent_count == len { // If the most frequent value is the only value, we should compress as constant instead. return Ok(ConstantArray::new( Scalar::primitive_value( diff --git a/vortex-compressor/src/builtins/constant/float.rs b/vortex-compressor/src/builtins/constant/float.rs index 0480a1b7a53..9658c807acc 100644 --- a/vortex-compressor/src/builtins/constant/float.rs +++ b/vortex-compressor/src/builtins/constant/float.rs @@ -48,17 +48,17 @@ impl Scheme for FloatConstantScheme { // Note that we only compute distinct counts if other schemes have requested it. if let Some(distinct_count) = stats.distinct_count() { - if distinct_count > 1 { - return CompressionEstimate::Verdict(EstimateVerdict::Skip); + return if distinct_count > 1 { + CompressionEstimate::Verdict(EstimateVerdict::Skip) } else { debug_assert_eq!(distinct_count, 1); - return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); - } + CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse) + }; } // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count() as usize, array_len); + debug_assert_eq!(stats.null_count(), array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } diff --git a/vortex-compressor/src/builtins/constant/integer.rs b/vortex-compressor/src/builtins/constant/integer.rs index 3f324c36e17..77311684a40 100644 --- a/vortex-compressor/src/builtins/constant/integer.rs +++ b/vortex-compressor/src/builtins/constant/integer.rs @@ -46,17 +46,17 @@ impl Scheme for IntConstantScheme { // Note that we only compute distinct counts if other schemes have requested it. if let Some(distinct_count) = stats.distinct_count() { - if distinct_count > 1 { - return CompressionEstimate::Verdict(EstimateVerdict::Skip); + return if distinct_count > 1 { + CompressionEstimate::Verdict(EstimateVerdict::Skip) } else { debug_assert_eq!(distinct_count, 1); - return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); - } + CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse) + }; } // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count() as usize, array_len); + debug_assert_eq!(stats.null_count(), array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs index 10387756be8..0b6066a3a13 100644 --- a/vortex-compressor/src/builtins/dict/integer.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -76,21 +76,21 @@ impl Scheme for IntDictScheme { // Ignore nulls encoding for the estimate. We only focus on values. - let values_size = bit_width * distinct_values_count as usize; + let values_size = bit_width * distinct_values_count; // TODO(connor): Should we just hardcode this instead of let the compressor choose? // Assume codes are compressed RLE + BitPacking. let codes_bw = u32::BITS - distinct_values_count.leading_zeros(); - let n_runs = (stats.value_count() / stats.average_run_length()) as usize; + let n_runs = stats.value_count() / stats.average_run_length(); // Assume that codes will either be BitPack or RLE-BitPack. - let codes_size_bp = codes_bw as usize * stats.value_count() as usize; + let codes_size_bp = codes_bw as usize * stats.value_count(); let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs); let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); - let before = stats.value_count() as usize * bit_width; + let before = stats.value_count() * bit_width; CompressionEstimate::Verdict(EstimateVerdict::Ratio( before as f64 / (values_size + codes_size) as f64, diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs index 26f3c22d503..d003abca9e7 100644 --- a/vortex-compressor/src/stats/float.rs +++ b/vortex-compressor/src/stats/float.rs @@ -30,7 +30,7 @@ use super::cardinality::CardinalityEstimator; #[derive(Debug, Clone)] pub struct DistinctInfo { /// The estimated count of unique values. This _must_ be non-zero. - distinct_count: u32, + distinct_count: usize, /// Phantom marker for the float element type. _marker: PhantomData, } @@ -62,7 +62,7 @@ pub enum ErasedStats { impl ErasedStats { /// Get the count of distinct values, if we have computed it already. - fn distinct_count(&self) -> Option { + fn distinct_count(&self) -> Option { match self { ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count), ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count), @@ -90,11 +90,11 @@ impl_from_typed!(f64, ErasedStats::F64); #[derive(Debug, Clone)] pub struct FloatStats { /// Cache for `validity.false_count()`. - null_count: u32, + null_count: usize, /// Cache for `validity.true_count()`. - value_count: u32, + value_count: usize, /// The average run length. - average_run_length: u32, + average_run_length: usize, /// Type-erased typed statistics. erased: ErasedStats, } @@ -115,7 +115,7 @@ impl FloatStats { } /// Get the count of distinct values, if we have computed it already. - pub fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { self.erased.distinct_count() } } @@ -137,17 +137,17 @@ impl FloatStats { } /// Returns the number of null values. - pub fn null_count(&self) -> u32 { + pub fn null_count(&self) -> usize { self.null_count } /// Returns the number of non-null values. - pub fn value_count(&self) -> u32 { + pub fn value_count(&self) -> usize { self.value_count } /// Returns the average run length. - pub fn average_run_length(&self) -> u32 { + pub fn average_run_length(&self) -> usize { self.average_run_length } @@ -179,7 +179,7 @@ where if array.all_invalid(ctx)? { return Ok(FloatStats { - null_count: u32::try_from(array.len())?, + null_count: array.len(), value_count: 0, average_run_length: 0, erased: TypedStats { @@ -248,13 +248,8 @@ where } } - let null_count = u32::try_from(null_count)?; - let value_count = u32::try_from(value_count)?; - let distinct = count_distinct_values.then(|| DistinctInfo { - distinct_count: u32::try_from(estimator.estimate()) - .vortex_expect("more than u32::MAX distinct values") - .max(1), + distinct_count: estimator.estimate().max(1), _marker: PhantomData, }); diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index e20d29fe737..ee7cf7671b2 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -47,11 +47,11 @@ const DISTINCT_COUNT_CONFIDENCE_SIGMAS: usize = 4; #[derive(Debug, Clone)] pub struct DistinctInfo { /// The estimated count of unique values. This _must_ be non-zero. - distinct_count: u32, + distinct_count: usize, /// The most frequent value (Boyer-Moore majority candidate). most_frequent_value: T, /// The exact number of times `most_frequent_value` occurs in the array. - top_frequency: u32, + top_frequency: usize, } /// Typed statistics for a specific integer type. @@ -74,12 +74,12 @@ impl TypedStats { impl TypedStats { /// Get the count of distinct values, if we have computed it already. - fn distinct_count(&self) -> Option { + fn distinct_count(&self) -> Option { Some(self.distinct.as_ref()?.distinct_count) } /// Get the most commonly occurring value and its count, if we have computed it already. - fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> { + fn most_frequent_value_and_count(&self) -> Option<(&T, usize)> { let distinct = self.distinct.as_ref()?; Some((&distinct.most_frequent_value, distinct.top_frequency)) } @@ -176,7 +176,7 @@ impl ErasedStats { } /// Get the count of distinct values, if we have computed it already. - pub fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { match &self { ErasedStats::U8(x) => x.distinct_count(), ErasedStats::U16(x) => x.distinct_count(), @@ -190,7 +190,7 @@ impl ErasedStats { } /// Get the most commonly occurring value and its count. - pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { + pub fn most_frequent_value_and_count(&self) -> Option<(PValue, usize)> { match &self { ErasedStats::U8(x) => { let (top_value, count) = x.most_frequent_value_and_count()?; @@ -252,11 +252,11 @@ impl_from_typed!(i64, ErasedStats::I64); #[derive(Clone, Debug)] pub struct IntegerStats { /// Cache for `validity.false_count()`. - null_count: u32, + null_count: usize, /// Cache for `validity.true_count()`. - value_count: u32, + value_count: usize, /// The average run length. - average_run_length: u32, + average_run_length: usize, /// Type-erased typed statistics. erased: ErasedStats, } @@ -274,7 +274,7 @@ impl IntegerStats { } /// Get the count of distinct values, if we have computed it already. - pub fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { self.erased.distinct_count() } @@ -289,7 +289,6 @@ impl IntegerStats { let Some(distinct_count) = self.distinct_count() else { return true; }; - let distinct_count = distinct_count as usize; // The true distinct count never exceeds `count`, so an over-estimate reflects estimator // noise and remains consistent with all values being distinct. @@ -301,7 +300,7 @@ impl IntegerStats { } /// Get the most commonly occurring value and its count, if we have computed it already. - pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { + pub fn most_frequent_value_and_count(&self) -> Option<(PValue, usize)> { self.erased.most_frequent_value_and_count() } } @@ -333,17 +332,17 @@ impl IntegerStats { } /// Returns the number of null values. - pub fn null_count(&self) -> u32 { + pub fn null_count(&self) -> usize { self.null_count } /// Returns the number of non-null values. - pub fn value_count(&self) -> u32 { + pub fn value_count(&self) -> usize { self.value_count } /// Returns the average run length. - pub fn average_run_length(&self) -> u32 { + pub fn average_run_length(&self) -> usize { self.average_run_length } @@ -381,7 +380,7 @@ where if array.all_invalid(ctx)? { return Ok(IntegerStats { - null_count: u32::try_from(array.len())?, + null_count: array.len(), value_count: 0, average_run_length: 0, erased: TypedStats { @@ -491,9 +490,7 @@ where let distinct = count_distinct_values.then(|| { // The cardinality estimator is exact for small cardinalities and approximate beyond. // We clamp to at least 1 because we are inside the non-empty/non-all-null branch. - let distinct_count = u32::try_from(loop_state.estimator.estimate()) - .vortex_expect("there are more than `u32::MAX` distinct values") - .max(1); + let distinct_count = loop_state.estimator.estimate().max(1); // Count the Boyer-Moore majority candidate exactly via a second pass. If any value // accounts for more than half of the non-null entries, this counts that value; otherwise @@ -513,9 +510,6 @@ where let typed = TypedStats { min, max, distinct }; - let null_count = u32::try_from(null_count)?; - let value_count = u32::try_from(value_count)?; - Ok(IntegerStats { null_count, value_count, @@ -533,14 +527,14 @@ where /// The previous value seen. prev: T, /// The run count. - runs: u32, + runs: usize, /// Cloudflare's cardinality estimator, used to approximate the number of distinct values /// without materializing an exact hash map. estimator: CardinalityEstimator>, /// Boyer-Moore majority candidate; holds the current candidate for the most frequent value. bm_candidate: T, /// Boyer-Moore vote counter for `bm_candidate`. - bm_votes: u32, + bm_votes: usize, } /// Updates the Boyer-Moore majority-vote state for a single value. @@ -562,8 +556,12 @@ where /// Counts exact occurrences of `needle` in `buffer`, restricted to valid positions according to /// `validity`. -fn count_occurrences(buffer: &[T], validity: AllOr<&BitBuffer>, needle: T) -> u32 { - let count = match validity { +fn count_occurrences( + buffer: &[T], + validity: AllOr<&BitBuffer>, + needle: T, +) -> usize { + match validity { AllOr::All => buffer.iter().filter(|&&v| v == needle).count(), AllOr::None => 0, AllOr::Some(mask) => buffer @@ -571,8 +569,7 @@ fn count_occurrences(buffer: &[T], validity: AllOr<&BitBuffer>, .enumerate() .filter(|&(idx, &v)| mask.value(idx) && v == needle) .count(), - }; - u32::try_from(count).vortex_expect("occurrences cannot exceed `u32::MAX`") + } } /// Inner loop for non-null chunks of 64 values. From 7ef762cb274d24d996ae0f7d502a0306c964baf6 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 3 Jun 2026 14:46:25 +0100 Subject: [PATCH 08/16] fix Signed-off-by: Robert Kruszewski --- vortex-compressor/src/builtins/dict/integer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs index 0b6066a3a13..7457e4ababa 100644 --- a/vortex-compressor/src/builtins/dict/integer.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -80,7 +80,7 @@ impl Scheme for IntDictScheme { // TODO(connor): Should we just hardcode this instead of let the compressor choose? // Assume codes are compressed RLE + BitPacking. - let codes_bw = u32::BITS - distinct_values_count.leading_zeros(); + let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); let n_runs = stats.value_count() / stats.average_run_length(); From 20060f9d6b3e95361f3ba5e803eda8e20e8a02d9 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 4 Jun 2026 15:30:27 +0100 Subject: [PATCH 09/16] more Signed-off-by: Robert Kruszewski --- vortex-array/src/builders/dict/bytes.rs | 125 ++++++++++++------ vortex-array/src/builders/dict/primitive.rs | 43 +++--- .../src/builtins/constant/binary.rs | 5 +- vortex-compressor/src/builtins/dict/binary.rs | 11 +- vortex-compressor/src/builtins/dict/float.rs | 11 +- .../src/builtins/dict/integer.rs | 8 +- vortex-compressor/src/builtins/dict/string.rs | 11 +- vortex-compressor/src/stats/cardinality.rs | 23 ++++ vortex-compressor/src/stats/float.rs | 10 ++ vortex-compressor/src/stats/integer.rs | 34 ++--- vortex-compressor/src/stats/varbinview.rs | 10 ++ 11 files changed, 193 insertions(+), 98 deletions(-) diff --git a/vortex-array/src/builders/dict/bytes.rs b/vortex-array/src/builders/dict/bytes.rs index 3a045b5c21c..d2e5455d7b7 100644 --- a/vortex-array/src/builders/dict/bytes.rs +++ b/vortex-array/src/builders/dict/bytes.rs @@ -6,11 +6,11 @@ use std::hash::BuildHasher; use std::mem; use std::sync::Arc; -use itertools::Itertools; use num_traits::AsPrimitive; use vortex_array::ExecutionCtx; use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -131,26 +131,49 @@ impl BytesDictBuilder { } } - /// Encode a stream of value bytes against the dictionary, honoring the supplied validity mask. + fn encode_null(&mut self) -> Option { + if let Some(code) = self.null_code.get() { + return Some(*code); + } + + if self.views.len() >= self.max_dict_len + || self.dict_bytes() + size_of::() > self.max_dict_bytes + { + return None; + } + + let code = self.views.len(); + self.views.push(BinaryView::default()); + self.values_nulls.append_false(); + let code = Code::from_usize(code) + .unwrap_or_else(|| vortex_panic!("{} has to fit into {}", code, Code::PTYPE)); + self.null_code + .set(code) + .ok() + .vortex_expect("null code is initialized once"); + Some(code) + } + + /// Encode row values against the dictionary, honoring the supplied validity mask. /// - /// `values` must yield one slice per logical row in input order; the mask is applied here so - /// callers do not need to emit anything for null positions. - fn encode_iter<'a, I>( + /// `value_at` is called only for valid rows. That matters for VarBinView arrays because null + /// rows can hold arbitrary view metadata. + fn encode_validity<'a, F>( &mut self, len: usize, validity_mask: Mask, - values: I, + mut value_at: F, ) -> VortexResult where - I: Iterator, + F: FnMut(usize) -> &'a [u8], { let mut local_lookup = self.lookup.take().vortex_expect("Must have a lookup dict"); let mut codes: BufferMut = BufferMut::with_capacity(len); match validity_mask.bit_buffer() { AllOr::All => { - for value in values { - let Some(code) = self.encode_value(&mut local_lookup, value) else { + for idx in 0..len { + let Some(code) = self.encode_value(&mut local_lookup, value_at(idx)) else { break; }; // SAFETY: we reserved capacity in the buffer for `len` elements @@ -158,27 +181,20 @@ impl BytesDictBuilder { } } AllOr::None => { - self.views.push(BinaryView::default()); - self.values_nulls.append_false(); - unsafe { - codes.push_n_unchecked(Code::from_usize(0).vortex_expect("must fit 0"), len) + if let Some(code) = self.encode_null() { + unsafe { codes.push_n_unchecked(code, len) } } } AllOr::Some(b) => { - for (value, valid) in values.zip_eq(b.iter()) { + for (idx, valid) in b.iter().enumerate() { if !valid { - let code = self.null_code.get_or_init(|| { - let code = self.views.len(); - self.views.push(BinaryView::default()); - self.values_nulls.append_false(); - Code::from_usize(code).unwrap_or_else(|| { - vortex_panic!("{} has to fit into {}", code, Code::PTYPE) - }) - }); + let Some(code) = self.encode_null() else { + break; + }; // SAFETY: we reserved capacity in the buffer for `len` elements - unsafe { codes.push_unchecked(*code) } + unsafe { codes.push_unchecked(code) } } else { - let Some(code) = self.encode_value(&mut local_lookup, value) else { + let Some(code) = self.encode_value(&mut local_lookup, value_at(idx)) else { break; }; // SAFETY: we reserved capacity in the buffer for `len` elements @@ -206,12 +222,11 @@ impl BytesDictBuilder { match_each_integer_ptype!(offsets.ptype(), |P| { let slice_offsets = offsets.as_slice::

(); - let values = slice_offsets.windows(2).map(|w| { - let start: usize = w[0].as_(); - let end: usize = w[1].as_(); + self.encode_validity(len, validity_mask, |idx| { + let start: usize = slice_offsets[idx].as_(); + let end: usize = slice_offsets[idx + 1].as_(); &bytes[start..end] - }); - self.encode_iter(len, validity_mask, values) + }) }) } @@ -231,14 +246,16 @@ impl BytesDictBuilder { .map(|b| b.as_host()) .collect::>(); - let values = views.iter().map(|view| { - if view.is_inlined() { - view.as_inlined().value() - } else { - &buffers[view.as_view().buffer_index as usize][view.as_view().as_range()] - } - }); - self.encode_iter(len, validity_mask, values) + self.encode_validity(len, validity_mask, |idx| view_bytes(&buffers, &views[idx])) + } +} + +fn view_bytes<'a>(buffers: &[&'a ByteBuffer], view: &'a BinaryView) -> &'a [u8] { + if view.is_inlined() { + view.as_inlined().value() + } else { + let view = view.as_view(); + &buffers[view.buffer_index as usize][view.as_range()] } } @@ -290,8 +307,11 @@ impl DictEncoder for BytesDictBuilder { #[cfg(test)] mod test { use std::str; + use std::sync::Arc; use std::sync::LazyLock; + use vortex_buffer::Buffer; + use vortex_buffer::ByteBuffer; use vortex_session::VortexSession; use crate::IntoArray; @@ -301,7 +321,12 @@ mod test { use crate::arrays::VarBinArray; use crate::arrays::VarBinViewArray; use crate::arrays::dict::DictArraySlotsExt; + use crate::arrays::varbinview::BinaryView; + use crate::buffer::BufferHandle; use crate::builders::dict::dict_encode; + use crate::dtype::DType; + use crate::dtype::Nullability; + use crate::validity::Validity; static SESSION: LazyLock = LazyLock::new(crate::array_session); @@ -365,6 +390,32 @@ mod test { }); } + #[test] + fn encode_varbinview_ignores_invalid_null_views() { + let value = b"outlined value"; + let valid_view = BinaryView::make_view(value, 0, 0); + let invalid_null_view = BinaryView::make_view(b"invalid null view", 99, 0); + let views = Buffer::copy_from([valid_view, invalid_null_view, valid_view]); + let buffers = Arc::from([BufferHandle::new_host(ByteBuffer::copy_from(value))]); + let arr = unsafe { + VarBinViewArray::new_handle_unchecked( + BufferHandle::new_host(views.into_byte_buffer()), + buffers, + DType::Utf8(Nullability::Nullable), + Validity::from_iter([true, false, true]), + ) + } + .into_array(); + + let dict = dict_encode(&arr, &mut SESSION.create_execution_ctx()).unwrap(); + let codes = dict + .codes() + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); + assert_eq!(codes.as_slice::(), &[0, 1, 0]); + } + #[test] fn repeated_values() { let arr = VarBinArray::from(vec!["a", "a", "b", "b", "a", "b", "a", "b"]); diff --git a/vortex-array/src/builders/dict/primitive.rs b/vortex-array/src/builders/dict/primitive.rs index 3d80cd4723f..e8d36d128f2 100644 --- a/vortex-array/src/builders/dict/primitive.rs +++ b/vortex-array/src/builders/dict/primitive.rs @@ -98,6 +98,27 @@ where } } } + + fn encode_null(&mut self) -> Option { + if let Some(code) = self.null_code.get() { + return Some(*code); + } + + if self.values.len() >= self.max_dict_len { + return None; + } + + let code = Code::from_usize(self.values.len()).unwrap_or_else(|| { + vortex_panic!("{} has to fit into {}", self.values.len(), Code::PTYPE) + }); + self.values.push(T::default()); + self.values_nulls.append_false(); + self.null_code + .set(code) + .ok() + .vortex_expect("null code is initialized once"); + Some(code) + } } /// Dictionary encode primitive array with given PType. @@ -132,28 +153,18 @@ where } } Mask::AllFalse(_) => { - self.values.push(T::default()); - self.values_nulls.append_false(); - unsafe { - codes.push_n_unchecked( - Code::from_usize(0).vortex_expect("must fit 0"), - array.len(), - ) + if let Some(code) = self.encode_null() { + unsafe { codes.push_n_unchecked(code, array.len()) } } } Mask::Values(v) => { let bit_buff = v.bit_buffer(); for (&value, valid) in prim.as_slice::().iter().zip(bit_buff) { if !valid { - let code = self.null_code.get_or_init(|| { - let code = self.values.len(); - self.values.push(T::default()); - self.values_nulls.append_false(); - Code::from_usize(code).unwrap_or_else(|| { - vortex_panic!("{} has to fit into {}", code, Code::PTYPE) - }) - }); - unsafe { codes.push_unchecked(*code) } + let Some(code) = self.encode_null() else { + break; + }; + unsafe { codes.push_unchecked(code) } } else { let Some(code) = self.encode_value(value) else { break; diff --git a/vortex-compressor/src/builtins/constant/binary.rs b/vortex-compressor/src/builtins/constant/binary.rs index 6b1395ed0fd..c5d84ef463d 100644 --- a/vortex-compressor/src/builtins/constant/binary.rs +++ b/vortex-compressor/src/builtins/constant/binary.rs @@ -52,8 +52,9 @@ impl Scheme for BinaryConstantScheme { return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } - // Since the estimated distinct count is always going to be less than or equal to the actual - // distinct count, if this is not equal to 1 the actual is definitely not equal to 1. + // The estimator is exact for small cardinalities, so a constant array (exactly one + // distinct value) is always reported as 1. An estimate above 1 therefore means the array + // is genuinely not constant; an estimate of 1 still falls through to the exact check below. if stats.estimated_distinct_count().is_some_and(|c| c > 1) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/dict/binary.rs b/vortex-compressor/src/builtins/dict/binary.rs index bd194aa1121..fe8345e6502 100644 --- a/vortex-compressor/src/builtins/dict/binary.rs +++ b/vortex-compressor/src/builtins/dict/binary.rs @@ -16,7 +16,6 @@ use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::builders::dict::dict_encode; -use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; @@ -80,12 +79,12 @@ impl Scheme for BinaryDictScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect( - "this must be present since `DictScheme` declared that we need distinct values", - ); + let max_useful_distinct_count = stats.value_count() / 2; - // If > 50% of the values are distinct, skip dictionary scheme. - if estimated_distinct_values_count > stats.value_count() / 2 { + // If more than 50% of the values are definitely distinct, skip dictionary encoding. + // Cardinality is approximate for larger counts, so keep borderline estimates alive long + // enough for sampling to decide. + if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/dict/float.rs b/vortex-compressor/src/builtins/dict/float.rs index 1af27466e67..67836c39afb 100644 --- a/vortex-compressor/src/builtins/dict/float.rs +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -16,7 +16,6 @@ use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::builders::dict::dict_encode; -use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; @@ -89,12 +88,12 @@ impl Scheme for FloatDictScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - let distinct_values_count = stats.distinct_count().vortex_expect( - "this must be present since `DictScheme` declared that we need distinct values", - ); + let max_useful_distinct_count = stats.value_count() / 2; - // If > 50% of the values are distinct, skip dictionary scheme. - if distinct_values_count > stats.value_count() / 2 { + // If more than 50% of the values are definitely distinct, skip dictionary encoding. + // Cardinality is approximate for larger counts, so keep borderline estimates alive long + // enough for sampling to decide. + if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs index 7457e4ababa..f9292dce167 100644 --- a/vortex-compressor/src/builtins/dict/integer.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -69,8 +69,12 @@ impl Scheme for IntDictScheme { "this must be present since `DictScheme` declared that we need distinct values", ); - // If > 50% of the values are distinct, skip dictionary scheme. - if distinct_values_count > stats.value_count() / 2 { + let max_useful_distinct_count = stats.value_count() / 2; + + // If more than 50% of the values are definitely distinct, skip dictionary encoding. + // Cardinality is approximate for larger counts, so keep borderline estimates alive long + // enough for sampling to decide. + if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/dict/string.rs b/vortex-compressor/src/builtins/dict/string.rs index 72db72c2cc5..978050be970 100644 --- a/vortex-compressor/src/builtins/dict/string.rs +++ b/vortex-compressor/src/builtins/dict/string.rs @@ -16,7 +16,6 @@ use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::builders::dict::dict_encode; -use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; @@ -80,12 +79,12 @@ impl Scheme for StringDictScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect( - "this must be present since `DictScheme` declared that we need distinct values", - ); + let max_useful_distinct_count = stats.value_count() / 2; - // If > 50% of the values are distinct, skip dictionary scheme. - if estimated_distinct_values_count > stats.value_count() / 2 { + // If more than 50% of the values are definitely distinct, skip dictionary encoding. + // Cardinality is approximate for larger counts, so keep borderline estimates alive long + // enough for sampling to decide. + if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/stats/cardinality.rs b/vortex-compressor/src/stats/cardinality.rs index a17dc40d018..d3e50c14f7a 100644 --- a/vortex-compressor/src/stats/cardinality.rs +++ b/vortex-compressor/src/stats/cardinality.rs @@ -14,6 +14,16 @@ use std::hash::Hash; +/// Expected relative standard error of the default cardinality estimator. +/// +/// Cloudflare's default `P=12` HLL++ parameters document this as +/// `1.04 / sqrt(2^12) ≈ 1.625%`. +const DISTINCT_COUNT_STD_ERROR_NUMERATOR: usize = 65; +/// Denominator for [`DISTINCT_COUNT_STD_ERROR_NUMERATOR`]. +const DISTINCT_COUNT_STD_ERROR_DENOMINATOR: usize = 4_000; +/// Number of standard errors to tolerate when turning estimates into scheme-selection cutoffs. +const DISTINCT_COUNT_CONFIDENCE_SIGMAS: usize = 4; + /// Approximate distinct-count estimator with a tiny, stable surface area. /// /// The estimator is exact for small cardinalities on 64-bit targets and approximate beyond. On @@ -56,6 +66,19 @@ impl Default for CardinalityEstimator { } } +/// Returns an absolute tolerance for comparing a cardinality estimate with `count`. +pub(crate) fn distinct_count_error_bound(count: usize) -> usize { + count + .saturating_mul(DISTINCT_COUNT_STD_ERROR_NUMERATOR) + .saturating_mul(DISTINCT_COUNT_CONFIDENCE_SIGMAS) + .div_ceil(DISTINCT_COUNT_STD_ERROR_DENOMINATOR) +} + +/// Returns true if `estimate` is still consistent with the true count being at most `count`. +pub(crate) fn estimate_could_be_at_most(estimate: usize, count: usize) -> bool { + estimate <= count || estimate - count <= distinct_count_error_bound(count) +} + /// Backend implementations selected at compile time by pointer width. #[cfg(target_pointer_width = "64")] mod inner { diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs index d003abca9e7..3196b95eb12 100644 --- a/vortex-compressor/src/stats/float.rs +++ b/vortex-compressor/src/stats/float.rs @@ -22,6 +22,7 @@ use vortex_mask::AllOr; use super::GenerateStatsOptions; use super::cardinality::CardinalityEstimator; +use super::cardinality::estimate_could_be_at_most; /// Information about the distinct values in a float array. /// @@ -118,6 +119,15 @@ impl FloatStats { pub fn distinct_count(&self) -> Option { self.erased.distinct_count() } + + /// Returns true if the true distinct count could plausibly be at most `count`. + pub fn estimated_distinct_count_could_be_at_most(&self, count: usize) -> bool { + let Some(distinct_count) = self.distinct_count() else { + return true; + }; + + estimate_could_be_at_most(distinct_count, count) + } } impl FloatStats { diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index ee7cf7671b2..90666cf5eab 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -22,19 +22,8 @@ use vortex_mask::AllOr; use super::GenerateStatsOptions; use super::cardinality::CardinalityEstimator; - -/// Expected relative standard error of the default cardinality estimator. -/// -/// Cloudflare's default `P=12` HLL++ parameters document this as `1.04 / sqrt(2^12) ≈ 1.625%`. -const DISTINCT_COUNT_STD_ERROR_NUMERATOR: usize = 65; -/// Denominator for [`DISTINCT_COUNT_STD_ERROR_NUMERATOR`]. -const DISTINCT_COUNT_STD_ERROR_DENOMINATOR: usize = 4_000; -/// Number of standard errors to tolerate when deciding whether an estimate is consistent with an -/// exact count. A single standard error only covers ~68% of estimates, which spuriously rejects -/// genuinely all-distinct arrays (e.g. a true arithmetic sequence whose estimate drifts a few -/// percent off the array length). A wider interval keeps the false-rejection rate negligible while -/// still excluding clearly low-cardinality data, whose estimate is far below the array length. -const DISTINCT_COUNT_CONFIDENCE_SIGMAS: usize = 4; +use super::cardinality::distinct_count_error_bound; +use super::cardinality::estimate_could_be_at_most; /// Information about the distinct values in an integer array. /// @@ -299,22 +288,21 @@ impl IntegerStats { count - distinct_count <= distinct_count_error_bound(count) } + /// Returns true if the true distinct count could plausibly be at most `count`. + pub fn estimated_distinct_count_could_be_at_most(&self, count: usize) -> bool { + let Some(distinct_count) = self.distinct_count() else { + return true; + }; + + estimate_could_be_at_most(distinct_count, count) + } + /// Get the most commonly occurring value and its count, if we have computed it already. pub fn most_frequent_value_and_count(&self) -> Option<(PValue, usize)> { self.erased.most_frequent_value_and_count() } } -/// Returns the absolute tolerance for treating an estimate as consistent with `count` distinct -/// values, sized as [`DISTINCT_COUNT_CONFIDENCE_SIGMAS`] multiples of the estimator's standard -/// error. -fn distinct_count_error_bound(count: usize) -> usize { - count - .saturating_mul(DISTINCT_COUNT_STD_ERROR_NUMERATOR) - .saturating_mul(DISTINCT_COUNT_CONFIDENCE_SIGMAS) - .div_ceil(DISTINCT_COUNT_STD_ERROR_DENOMINATOR) -} - impl IntegerStats { /// Generates stats with default options. pub fn generate(input: &PrimitiveArray, ctx: &mut ExecutionCtx) -> Self { diff --git a/vortex-compressor/src/stats/varbinview.rs b/vortex-compressor/src/stats/varbinview.rs index 6ea747914c8..47c5b8c12d2 100644 --- a/vortex-compressor/src/stats/varbinview.rs +++ b/vortex-compressor/src/stats/varbinview.rs @@ -14,6 +14,7 @@ use vortex_mask::AllOr; use super::GenerateStatsOptions; use super::cardinality::CardinalityEstimator; +use super::cardinality::estimate_could_be_at_most; /// Array of variable-length byte/string values, and relevant stats for compression. #[derive(Clone, Debug)] @@ -132,6 +133,15 @@ impl StringStats { self.estimated_distinct_count } + /// Returns true if the true distinct count could plausibly be at most `count`. + pub fn estimated_distinct_count_could_be_at_most(&self, count: usize) -> bool { + let Some(distinct_count) = self.estimated_distinct_count else { + return true; + }; + + estimate_could_be_at_most(distinct_count, count) + } + /// Returns the number of non-null values. pub fn value_count(&self) -> usize { self.value_count From 53617069b9cbae10edcc1117d63906d32b3174ae Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Fri, 19 Jun 2026 14:38:48 +0100 Subject: [PATCH 10/16] less Signed-off-by: Robert Kruszewski --- Cargo.lock | 24 +- Cargo.toml | 2 - encodings/sequence/src/compress.rs | 52 +--- encodings/sequence/src/lib.rs | 1 - vortex-btrblocks/src/schemes/float/rle.rs | 4 +- vortex-btrblocks/src/schemes/integer/mod.rs | 4 +- vortex-btrblocks/src/schemes/integer/rle.rs | 4 +- .../src/schemes/integer/runend.rs | 6 +- .../schemes/integer/scheme_selection_tests.rs | 17 -- .../src/schemes/integer/sequence.rs | 14 +- .../src/schemes/integer/sparse.rs | 2 +- vortex-compressor/Cargo.toml | 9 - vortex-compressor/benches/dict_encode.rs | 3 +- .../src/builtins/constant/binary.rs | 7 +- .../src/builtins/constant/float.rs | 10 +- .../src/builtins/constant/integer.rs | 10 +- .../src/builtins/constant/string.rs | 7 +- vortex-compressor/src/builtins/dict/binary.rs | 11 +- vortex-compressor/src/builtins/dict/float.rs | 142 ++++++++- .../src/builtins/dict/integer.rs | 162 +++++++++-- vortex-compressor/src/builtins/dict/mod.rs | 2 + vortex-compressor/src/builtins/dict/string.rs | 11 +- vortex-compressor/src/builtins/mod.rs | 2 + vortex-compressor/src/stats/bool.rs | 16 +- vortex-compressor/src/stats/cardinality.rs | 169 ----------- vortex-compressor/src/stats/float.rs | 86 +++--- vortex-compressor/src/stats/integer.rs | 274 ++++-------------- vortex-compressor/src/stats/mod.rs | 1 - vortex-compressor/src/stats/varbinview.rs | 107 ++----- 29 files changed, 483 insertions(+), 676 deletions(-) delete mode 100644 vortex-compressor/src/stats/cardinality.rs diff --git a/Cargo.lock b/Cargo.lock index c83f5c50463..f36678b069c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1036,16 +1036,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "cardinality-estimator" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a7bc4e9a7ab9239a4a46df35d75101cee57defc8e4255a1b2b6f8eab8de87e" -dependencies = [ - "enum_dispatch", - "wyhash", -] - [[package]] name = "cargo-platform" version = "0.3.3" @@ -3602,6 +3592,7 @@ dependencies = [ ] [[package]] +<<<<<<< HEAD name = "enum_dispatch" version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3614,6 +3605,8 @@ dependencies = [ ] [[package]] +======= +>>>>>>> 1db45e55e (less) name = "env_filter" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -9726,9 +9719,7 @@ dependencies = [ name = "vortex-compressor" version = "0.1.0" dependencies = [ - "cardinality-estimator", "codspeed-divan-compat", - "hyperloglogplus", "itertools 0.14.0", "num-traits", "parking_lot", @@ -11126,15 +11117,6 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" -[[package]] -name = "wyhash" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf6e163c25e3fac820b4b453185ea2dea3b6a3e0a721d4d23d75bd33734c295" -dependencies = [ - "rand_core 0.6.4", -] - [[package]] name = "wyz" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index 36c9b76713b..0a1cd5731e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -119,7 +119,6 @@ bit-vec = "0.9.0" bitvec = "1.0.1" bytes = "1.11.1" bzip2 = "0.6.0" -cardinality-estimator = "1.0.3" cargo_metadata = "0.23.1" cbindgen = "0.29.0" cc = "1.2" @@ -169,7 +168,6 @@ goldenfile = "1" half = { version = "2.7.1", features = ["std", "num-traits"] } hashbrown = "0.17.1" humansize = "2.1.3" -hyperloglogplus = "0.4.1" indicatif = "0.18.0" insta = "1.43" inventory = "0.3.20" diff --git a/encodings/sequence/src/compress.rs b/encodings/sequence/src/compress.rs index 750bd95c38c..b20980cf132 100644 --- a/encodings/sequence/src/compress.rs +++ b/encodings/sequence/src/compress.rs @@ -24,8 +24,6 @@ use vortex_error::VortexResult; use crate::Sequence; use crate::SequenceArray; use crate::SequenceData; -use crate::SequenceDataParts; - /// An iterator that yields `base, base + step, base + 2*step, ...` via repeated addition. struct SequenceIter { acc: T, @@ -93,22 +91,6 @@ pub fn sequence_encode( primitive_array: ArrayView<'_, Primitive>, ctx: &mut ExecutionCtx, ) -> VortexResult> { - let nullability = primitive_array.dtype().nullability(); - let len = primitive_array.len(); - let Some(parts) = sequence_parts(primitive_array, ctx)? else { - return Ok(None); - }; - - Sequence::try_new(parts.base, parts.multiplier, parts.ptype, nullability, len) - .map(|a| Some(a.into_array())) -} - -/// Returns the sequence base and multiplier if a primitive array can be represented exactly as a -/// [`SequenceArray`]. -pub fn sequence_parts( - primitive_array: ArrayView<'_, Primitive>, - ctx: &mut ExecutionCtx, -) -> VortexResult> { if primitive_array.is_empty() { // we cannot encode an empty array return Ok(None); @@ -124,22 +106,22 @@ pub fn sequence_parts( } match_each_integer_ptype!(primitive_array.ptype(), |P| { - sequence_parts_typed(primitive_array.as_slice::

()) + encode_primitive_array( + primitive_array.as_slice::

(), + primitive_array.dtype().nullability(), + ) }) } -fn sequence_parts_typed + CheckedAdd + CheckedSub>( +fn encode_primitive_array + CheckedAdd + CheckedSub>( slice: &[P], -) -> VortexResult> { + nullability: Nullability, +) -> VortexResult> { if slice.len() == 1 { // The multiplier here can be any value, zero is chosen - return Ok(Some(SequenceDataParts { - base: slice[0].into(), - multiplier: P::zero().into(), - ptype: P::PTYPE, - })); + return Sequence::try_new_typed(slice[0], P::zero(), nullability, 1) + .map(|a| Some(a.into_array())); } - let base = slice[0]; let Some(multiplier) = slice[1].checked_sub(&base) else { return Ok(None); @@ -154,18 +136,14 @@ fn sequence_parts_typed + CheckedAdd + CheckedSub> return Ok(None); } - if !slice + slice .windows(2) .all(|w| Some(w[1]) == w[0].checked_add(&multiplier)) - { - return Ok(None); - } - - Ok(Some(SequenceDataParts { - base: base.into(), - multiplier: multiplier.into(), - ptype: P::PTYPE, - })) + .then_some( + Sequence::try_new_typed(base, multiplier, nullability, slice.len()) + .map(|a| a.into_array()), + ) + .transpose() } #[cfg(test)] diff --git a/encodings/sequence/src/lib.rs b/encodings/sequence/src/lib.rs index a8194326f05..b898963c346 100644 --- a/encodings/sequence/src/lib.rs +++ b/encodings/sequence/src/lib.rs @@ -16,7 +16,6 @@ pub use array::SequenceArray; pub use array::SequenceData; pub use array::SequenceDataParts; pub use compress::sequence_encode; -pub use compress::sequence_parts; use vortex_array::ArrayVTable; use vortex_array::aggregate_fn::AggregateFnVTable; use vortex_array::aggregate_fn::fns::is_sorted::IsSorted; diff --git a/vortex-btrblocks/src/schemes/float/rle.rs b/vortex-btrblocks/src/schemes/float/rle.rs index 96f1b728e11..54406fcb9fd 100644 --- a/vortex-btrblocks/src/schemes/float/rle.rs +++ b/vortex-btrblocks/src/schemes/float/rle.rs @@ -17,7 +17,7 @@ use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; -use crate::schemes::integer::RUN_THRESHOLD; +use crate::schemes::integer::RUN_LENGTH_THRESHOLD; use crate::schemes::integer::rle_compress; use crate::schemes::rle_ancestor_exclusions; use crate::schemes::rle_descendant_exclusions; @@ -59,7 +59,7 @@ impl Scheme for FloatRLEScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - if data.float_stats(exec_ctx).average_run_length() < RUN_THRESHOLD { + if data.float_stats(exec_ctx).average_run_length() < RUN_LENGTH_THRESHOLD { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-btrblocks/src/schemes/integer/mod.rs b/vortex-btrblocks/src/schemes/integer/mod.rs index e31cd91374d..abe5868f5c8 100644 --- a/vortex-btrblocks/src/schemes/integer/mod.rs +++ b/vortex-btrblocks/src/schemes/integer/mod.rs @@ -35,8 +35,8 @@ pub use vortex_compressor::builtins::IntDictScheme; pub use vortex_compressor::stats::IntegerStats; pub use zigzag::ZigZagScheme; -/// Threshold for the average run length in an array before we consider any run encoding. -pub(crate) const RUN_THRESHOLD: usize = 4; +/// Threshold for the average run length in an array before we consider run-length encoding. +pub(crate) const RUN_LENGTH_THRESHOLD: u32 = 4; #[cfg(test)] mod scheme_selection_tests; diff --git a/vortex-btrblocks/src/schemes/integer/rle.rs b/vortex-btrblocks/src/schemes/integer/rle.rs index 40d55fea9ac..df0f0183780 100644 --- a/vortex-btrblocks/src/schemes/integer/rle.rs +++ b/vortex-btrblocks/src/schemes/integer/rle.rs @@ -22,12 +22,12 @@ use vortex_fastlanes::Delta; use vortex_fastlanes::RLE; use vortex_fastlanes::RLEArrayExt; +use super::RUN_LENGTH_THRESHOLD; use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; -use crate::schemes::integer::RUN_THRESHOLD; use crate::schemes::rle_ancestor_exclusions; use crate::schemes::rle_descendant_exclusions; @@ -179,7 +179,7 @@ impl Scheme for IntRLEScheme { if compress_ctx.finished_cascading() { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - if data.integer_stats(exec_ctx).average_run_length() < RUN_THRESHOLD { + if data.integer_stats(exec_ctx).average_run_length() < RUN_LENGTH_THRESHOLD { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-btrblocks/src/schemes/integer/runend.rs b/vortex-btrblocks/src/schemes/integer/runend.rs index 78efb77bc9b..b68567d789c 100644 --- a/vortex-btrblocks/src/schemes/integer/runend.rs +++ b/vortex-btrblocks/src/schemes/integer/runend.rs @@ -23,7 +23,6 @@ use vortex_runend::RunEnd; use vortex_runend::compress::runend_encode; use super::IntRLEScheme; -use super::RUN_THRESHOLD; use super::SparseScheme; use crate::ArrayAndStats; use crate::CascadingCompressor; @@ -31,6 +30,9 @@ use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; +/// Threshold for the average run length in an array before we consider run-end encoding. +const RUN_END_THRESHOLD: u32 = 4; + /// Run-end encoding with end positions. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct RunEndScheme; @@ -102,7 +104,7 @@ impl Scheme for RunEndScheme { exec_ctx: &mut ExecutionCtx, ) -> CompressionEstimate { // If the run length is below the threshold, drop it. - if data.integer_stats(exec_ctx).average_run_length() < RUN_THRESHOLD { + if data.integer_stats(exec_ctx).average_run_length() < RUN_END_THRESHOLD { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs index 77d69c62b92..e4227a472ec 100644 --- a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs +++ b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs @@ -21,7 +21,6 @@ use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_error::VortexResult; use vortex_fastlanes::BitPacked; -use vortex_fastlanes::Delta; use vortex_fastlanes::FoR; use vortex_runend::RunEnd; use vortex_sequence::Sequence; @@ -137,22 +136,6 @@ fn test_sequence_compressed() -> VortexResult<()> { Ok(()) } -#[test] -fn test_range_sequence_compressed() -> VortexResult<()> { - let array = PrimitiveArray::new( - (0..10_000u32).collect::>(), - Validity::NonNullable, - ); - let btr = BtrBlocksCompressor::default(); - let compressed = btr.compress(&array.into_array(), &mut SESSION.create_execution_ctx())?; - assert!( - compressed.is::() || compressed.is::(), - "expected Sequence or Delta, got {}", - compressed.encoding_id() - ); - Ok(()) -} - #[test] fn test_rle_compressed() -> VortexResult<()> { let mut values: Vec = Vec::new(); diff --git a/vortex-btrblocks/src/schemes/integer/sequence.rs b/vortex-btrblocks/src/schemes/integer/sequence.rs index bc66cbfbafb..604d7567d00 100644 --- a/vortex-btrblocks/src/schemes/integer/sequence.rs +++ b/vortex-btrblocks/src/schemes/integer/sequence.rs @@ -20,7 +20,6 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_sequence::sequence_encode; -use vortex_sequence::sequence_parts; use crate::ArrayAndStats; use crate::CascadingCompressor; @@ -83,10 +82,17 @@ impl Scheme for SequenceScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - if !stats.estimated_distinct_count_could_equal(data.array_len()) { + // If the distinct_values_count was computed, and not all values are unique, then this + // cannot be encoded as a sequence array. + if stats + .distinct_count() + .is_some_and(|count| count as usize != data.array_len()) + { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } + // TODO(connor): `sequence_encode` allocates the encoded array just to confirm feasibility. + // A cheaper `is_sequence` probe would let us skip the allocation entirely. CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( |_compressor, data, best_so_far, _ctx, exec_ctx| { // `SequenceArray` stores exactly two scalars (base and multiplier), so the best @@ -101,7 +107,9 @@ impl Scheme for SequenceScheme { return Ok(EstimateVerdict::Skip); } - if sequence_parts(data.array_as_primitive(), exec_ctx)?.is_none() { + // TODO(connor): We should pass this array back to the compressor in the case that + // we do want to sequence encode this so that we do not need to recompress. + if sequence_encode(data.array_as_primitive(), exec_ctx)?.is_none() { return Ok(EstimateVerdict::Skip); } // TODO(connor): Should we get the actual ratio here? diff --git a/vortex-btrblocks/src/schemes/integer/sparse.rs b/vortex-btrblocks/src/schemes/integer/sparse.rs index 0ae6defb97a..609a258d495 100644 --- a/vortex-btrblocks/src/schemes/integer/sparse.rs +++ b/vortex-btrblocks/src/schemes/integer/sparse.rs @@ -140,7 +140,7 @@ impl Scheme for SparseScheme { "this must be present since `SparseScheme` declared that we need distinct values", ); - if most_frequent_count == len { + if most_frequent_count as usize == len { // If the most frequent value is the only value, we should compress as constant instead. return Ok(ConstantArray::new( Scalar::primitive_value( diff --git a/vortex-compressor/Cargo.toml b/vortex-compressor/Cargo.toml index 317ddeca61b..2d28d86a5e6 100644 --- a/vortex-compressor/Cargo.toml +++ b/vortex-compressor/Cargo.toml @@ -26,15 +26,6 @@ vortex-error = { workspace = true } vortex-mask = { workspace = true } vortex-utils = { workspace = true } -# Cloudflare's `cardinality-estimator` uses a tagged-pointer representation gated on a 64-bit -# pointer width, so it cannot compile on 32-bit targets. On those platforms we fall back to the -# pure `hyperloglogplus` crate; the two backends are unified by the wrapper in `stats::cardinality`. -[target.'cfg(target_pointer_width = "64")'.dependencies] -cardinality-estimator = { workspace = true } - -[target.'cfg(not(target_pointer_width = "64"))'.dependencies] -hyperloglogplus = { workspace = true } - [dev-dependencies] divan = { workspace = true } rstest = { workspace = true } diff --git a/vortex-compressor/benches/dict_encode.rs b/vortex-compressor/benches/dict_encode.rs index 2c4e24108a7..c3d3d2e3617 100644 --- a/vortex-compressor/benches/dict_encode.rs +++ b/vortex-compressor/benches/dict_encode.rs @@ -3,10 +3,9 @@ #![expect(clippy::unwrap_used)] -use std::sync::LazyLock; - use divan::Bencher; use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; diff --git a/vortex-compressor/src/builtins/constant/binary.rs b/vortex-compressor/src/builtins/constant/binary.rs index c5d84ef463d..4ad24fe57b5 100644 --- a/vortex-compressor/src/builtins/constant/binary.rs +++ b/vortex-compressor/src/builtins/constant/binary.rs @@ -48,13 +48,12 @@ impl Scheme for BinaryConstantScheme { // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count(), array_len); + debug_assert_eq!(stats.null_count() as usize, array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } - // The estimator is exact for small cardinalities, so a constant array (exactly one - // distinct value) is always reported as 1. An estimate above 1 therefore means the array - // is genuinely not constant; an estimate of 1 still falls through to the exact check below. + // Since the estimated distinct count is always going to be less than or equal to the actual + // distinct count, if this is not equal to 1 the actual is definitely not equal to 1. if stats.estimated_distinct_count().is_some_and(|c| c > 1) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/constant/float.rs b/vortex-compressor/src/builtins/constant/float.rs index 9658c807acc..0480a1b7a53 100644 --- a/vortex-compressor/src/builtins/constant/float.rs +++ b/vortex-compressor/src/builtins/constant/float.rs @@ -48,17 +48,17 @@ impl Scheme for FloatConstantScheme { // Note that we only compute distinct counts if other schemes have requested it. if let Some(distinct_count) = stats.distinct_count() { - return if distinct_count > 1 { - CompressionEstimate::Verdict(EstimateVerdict::Skip) + if distinct_count > 1 { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); } else { debug_assert_eq!(distinct_count, 1); - CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse) - }; + return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); + } } // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count(), array_len); + debug_assert_eq!(stats.null_count() as usize, array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } diff --git a/vortex-compressor/src/builtins/constant/integer.rs b/vortex-compressor/src/builtins/constant/integer.rs index 77311684a40..3f324c36e17 100644 --- a/vortex-compressor/src/builtins/constant/integer.rs +++ b/vortex-compressor/src/builtins/constant/integer.rs @@ -46,17 +46,17 @@ impl Scheme for IntConstantScheme { // Note that we only compute distinct counts if other schemes have requested it. if let Some(distinct_count) = stats.distinct_count() { - return if distinct_count > 1 { - CompressionEstimate::Verdict(EstimateVerdict::Skip) + if distinct_count > 1 { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); } else { debug_assert_eq!(distinct_count, 1); - CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse) - }; + return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); + } } // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count(), array_len); + debug_assert_eq!(stats.null_count() as usize, array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } diff --git a/vortex-compressor/src/builtins/constant/string.rs b/vortex-compressor/src/builtins/constant/string.rs index da6b8419327..f55c1661660 100644 --- a/vortex-compressor/src/builtins/constant/string.rs +++ b/vortex-compressor/src/builtins/constant/string.rs @@ -48,13 +48,12 @@ impl Scheme for StringConstantScheme { // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { - debug_assert_eq!(stats.null_count(), array_len); + debug_assert_eq!(stats.null_count() as usize, array_len); return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); } - // The estimator is exact for small cardinalities, so a constant array (exactly one - // distinct value) is always reported as 1. An estimate above 1 therefore means the array - // is genuinely not constant; an estimate of 1 still falls through to the exact check below. + // Since the estimated distinct count is always going to be less than or equal to the actual + // distinct count, if this is not equal to 1 the actual is definitely not equal to 1. if stats.estimated_distinct_count().is_some_and(|c| c > 1) { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/dict/binary.rs b/vortex-compressor/src/builtins/dict/binary.rs index fe8345e6502..bd194aa1121 100644 --- a/vortex-compressor/src/builtins/dict/binary.rs +++ b/vortex-compressor/src/builtins/dict/binary.rs @@ -16,6 +16,7 @@ use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::builders::dict::dict_encode; +use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; @@ -79,12 +80,12 @@ impl Scheme for BinaryDictScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - let max_useful_distinct_count = stats.value_count() / 2; + let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); - // If more than 50% of the values are definitely distinct, skip dictionary encoding. - // Cardinality is approximate for larger counts, so keep borderline estimates alive long - // enough for sampling to decide. - if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { + // If > 50% of the values are distinct, skip dictionary scheme. + if estimated_distinct_values_count > stats.value_count() / 2 { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/dict/float.rs b/vortex-compressor/src/builtins/dict/float.rs index 67836c39afb..5288637a609 100644 --- a/vortex-compressor/src/builtins/dict/float.rs +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -7,15 +7,20 @@ //! external compatibility. use vortex_array::ArrayRef; +use vortex_array::ArrayView; use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::DictArray; +use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; -use vortex_array::builders::dict::dict_encode; +use vortex_array::dtype::half::f16; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; @@ -29,6 +34,8 @@ use crate::scheme::DescendantExclusion; use crate::scheme::Scheme; use crate::scheme::SchemeExt; use crate::stats::ArrayAndStats; +use crate::stats::FloatErasedStats; +use crate::stats::FloatStats; use crate::stats::GenerateStatsOptions; /// Dictionary encoding for low-cardinality float values. @@ -88,12 +95,12 @@ impl Scheme for FloatDictScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - let max_useful_distinct_count = stats.value_count() / 2; + let distinct_values_count = stats.distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); - // If more than 50% of the values are definitely distinct, skip dictionary encoding. - // Cardinality is approximate for larger counts, so keep borderline estimates alive long - // enough for sampling to decide. - if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { + // If > 50% of the values are distinct, skip dictionary scheme. + if distinct_values_count > stats.value_count() / 2 { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } @@ -108,7 +115,8 @@ impl Scheme for FloatDictScheme { compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> VortexResult { - let dict = dict_encode(data.array(), exec_ctx)?; + let stats = data.float_stats(exec_ctx); + let dict = dictionary_encode(data.array_as_primitive(), &stats)?; let has_all_values_referenced = dict.has_all_values_referenced(); @@ -137,6 +145,106 @@ impl Scheme for FloatDictScheme { } } +/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values. +macro_rules! typed_encode { + ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{ + let distinct = $typed.distinct().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values_validity = match $source_array.validity()? { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + let codes_validity = $source_array.validity()?; + + let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect(); + + let max_code = values.len(); + let codes = if max_code <= u8::MAX as usize { + let buf = >::encode( + &values, + $source_array.as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, codes_validity).into_array() + } else if max_code <= u16::MAX as usize { + let buf = >::encode( + &values, + $source_array.as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, codes_validity).into_array() + } else { + let buf = >::encode( + &values, + $source_array.as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, codes_validity).into_array() + }; + + let values = PrimitiveArray::new(values, values_validity).into_array(); + // SAFETY: enforced by the DictEncoder. + Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) }) + }}; +} + +/// Compresses a floating-point array into a dictionary array according to attached stats. +/// +/// # Errors +/// +/// Returns an error if unable to compute validity. +pub fn dictionary_encode( + array: ArrayView<'_, Primitive>, + stats: &FloatStats, +) -> VortexResult { + match stats.erased() { + FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16), + FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32), + FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64), + } +} + +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. +struct DictEncoder; + +/// Trait for encoding values of type `T` into codes of type `I`. +trait Encode { + /// Using the distinct value set, turn the values into a set of codes. + fn encode(distinct: &[T], values: &[T]) -> Buffer; +} + +/// Implements [`Encode`] for a float type using its bit representation as the hash key. +macro_rules! impl_encode { + ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); }; + ($typ:ty, $utyp:ty, $($ityp:ty),+) => { + $( + impl Encode<$typ, $ityp> for DictEncoder { + #[expect(clippy::cast_possible_truncation)] + fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { + let mut codes = + vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity( + distinct.len(), + ); + for (code, &value) in distinct.iter().enumerate() { + codes.insert(value.to_bits(), code as $ityp); + } + + let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); + for value in values { + // Any code lookups which fail are for nulls, so their value does not matter. + output.push(codes.get(&value.to_bits()).copied().unwrap_or_default()); + } + + output.freeze() + } + } + )* + }; +} + +impl_encode!(f16, u16); +impl_encode!(f32, u32); +impl_encode!(f64, u64); + #[cfg(test)] mod tests { use vortex_array::IntoArray; @@ -145,25 +253,35 @@ mod tests { use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::assert_arrays_eq; - use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; use vortex_buffer::buffer; use vortex_error::VortexResult; + use super::dictionary_encode; + use crate::stats::FloatStats; + use crate::stats::GenerateStatsOptions; + #[test] fn test_float_dict_encode() -> VortexResult<()> { let mut ctx = vortex_array::array_session().create_execution_ctx(); let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32]; let validity = Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); - let array = PrimitiveArray::new(values, validity).into_array(); + let array = PrimitiveArray::new(values, validity); - let dict_array = dict_encode(&array, &mut ctx)?; - assert_eq!(dict_array.values().len(), 3); + let stats = FloatStats::generate_opts( + &array, + GenerateStatsOptions { + count_distinct_values: true, + }, + &mut ctx, + ); + let dict_array = dictionary_encode(array.as_view(), &stats)?; + assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); let expected = PrimitiveArray::new( - buffer![1f32, 2f32, 2f32, 0f32, 1f32], + buffer![1f32, 2f32, 2f32, 1f32, 1f32], Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), ) .into_array(); diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs index f9292dce167..8ec3bf53345 100644 --- a/vortex-compressor/src/builtins/dict/integer.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -7,15 +7,18 @@ //! for external compatibility. use vortex_array::ArrayRef; +use vortex_array::ArrayView; use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::DictArray; +use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; -use vortex_array::builders::dict::dict_encode; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -27,6 +30,8 @@ use crate::scheme::Scheme; use crate::scheme::SchemeExt; use crate::stats::ArrayAndStats; use crate::stats::GenerateStatsOptions; +use crate::stats::IntegerErasedStats; +use crate::stats::IntegerStats; /// Dictionary encoding for low-cardinality integer values. #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -69,32 +74,28 @@ impl Scheme for IntDictScheme { "this must be present since `DictScheme` declared that we need distinct values", ); - let max_useful_distinct_count = stats.value_count() / 2; - - // If more than 50% of the values are definitely distinct, skip dictionary encoding. - // Cardinality is approximate for larger counts, so keep borderline estimates alive long - // enough for sampling to decide. - if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { + // If > 50% of the values are distinct, skip dictionary scheme. + if distinct_values_count > stats.value_count() / 2 { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } // Ignore nulls encoding for the estimate. We only focus on values. - let values_size = bit_width * distinct_values_count; + let values_size = bit_width * distinct_values_count as usize; // TODO(connor): Should we just hardcode this instead of let the compressor choose? // Assume codes are compressed RLE + BitPacking. - let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); + let codes_bw = u32::BITS - distinct_values_count.leading_zeros(); - let n_runs = stats.value_count() / stats.average_run_length(); + let n_runs = (stats.value_count() / stats.average_run_length()) as usize; // Assume that codes will either be BitPack or RLE-BitPack. - let codes_size_bp = codes_bw as usize * stats.value_count(); + let codes_size_bp = codes_bw as usize * stats.value_count() as usize; let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs); let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); - let before = stats.value_count() * bit_width; + let before = stats.value_count() as usize * bit_width; CompressionEstimate::Verdict(EstimateVerdict::Ratio( before as f64 / (values_size + codes_size) as f64, @@ -108,7 +109,8 @@ impl Scheme for IntDictScheme { compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> VortexResult { - let dict = dict_encode(data.array(), exec_ctx)?; + let stats = data.integer_stats(exec_ctx); + let dict = dictionary_encode(data.array_as_primitive(), &stats)?; // Values = child 0. let compressed_values = @@ -135,6 +137,121 @@ impl Scheme for IntDictScheme { } } +/// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values. +macro_rules! typed_encode { + ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{ + let distinct = $typed.distinct().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values_validity = match $source_array.validity()? { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + let codes_validity = $source_array.validity()?; + + let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect(); + + let max_code = values.len(); + let codes = if max_code <= u8::MAX as usize { + let buf = >::encode( + &values, + $source_array.as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, codes_validity).into_array() + } else if max_code <= u16::MAX as usize { + let buf = >::encode( + &values, + $source_array.as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, codes_validity).into_array() + } else { + let buf = >::encode( + &values, + $source_array.as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, codes_validity).into_array() + }; + + let values = PrimitiveArray::new(values, values_validity).into_array(); + // SAFETY: invariants enforced in DictEncoder. + Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) }) + }}; +} + +/// Compresses an integer array into a dictionary array according to attached stats. +/// +/// # Errors +/// +/// Returns an error if unable to compute validity. +#[expect( + clippy::cognitive_complexity, + reason = "complexity from match on all integer types" +)] +pub fn dictionary_encode( + array: ArrayView<'_, Primitive>, + stats: &IntegerStats, +) -> VortexResult { + match stats.erased() { + IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8), + IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16), + IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32), + IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64), + IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8), + IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16), + IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32), + IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64), + } +} + +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. +struct DictEncoder; + +/// Trait for encoding values of type `T` into codes of type `I`. +trait Encode { + /// Using the distinct value set, turn the values into a set of codes. + fn encode(distinct: &[T], values: &[T]) -> Buffer; +} + +/// Implements [`Encode`] for an integer type with all code width variants (u8, u16, u32). +macro_rules! impl_encode { + ($typ:ty) => { impl_encode!($typ, u8, u16, u32); }; + ($typ:ty, $($ityp:ty),+) => { + $( + impl Encode<$typ, $ityp> for DictEncoder { + #[expect(clippy::cast_possible_truncation)] + fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { + let mut codes = + vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity( + distinct.len(), + ); + for (code, &value) in distinct.iter().enumerate() { + codes.insert(value, code as $ityp); + } + + let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); + for value in values { + // Any code lookups which fail are for nulls, so their value does not matter. + // SAFETY: we have exactly sized output to be as large as values. + unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) }; + } + + output.freeze() + } + } + )* + }; +} + +impl_encode!(u8); +impl_encode!(u16); +impl_encode!(u32); +impl_encode!(u64); +impl_encode!(i8); +impl_encode!(i16); +impl_encode!(i32); +impl_encode!(i64); + #[cfg(test)] mod tests { use vortex_array::IntoArray; @@ -143,25 +260,34 @@ mod tests { use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::assert_arrays_eq; - use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; use vortex_buffer::buffer; use vortex_error::VortexResult; + use super::dictionary_encode; + use crate::stats::IntegerStats; + #[test] fn test_dict_encode_integer_stats() -> VortexResult<()> { let mut ctx = vortex_array::array_session().create_execution_ctx(); let data = buffer![100i32, 200, 100, 0, 100]; let validity = Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); - let array = PrimitiveArray::new(data, validity).into_array(); + let array = PrimitiveArray::new(data, validity); - let dict_array = dict_encode(&array, &mut ctx)?; - assert_eq!(dict_array.values().len(), 3); + let stats = IntegerStats::generate_opts( + &array, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + &mut ctx, + ); + let dict_array = dictionary_encode(array.as_view(), &stats)?; + assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); let expected = PrimitiveArray::new( - buffer![100i32, 200, 100, 0, 100], + buffer![100i32, 200, 100, 100, 100], Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), ) .into_array(); diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs index dd5fa0227ed..4862df2b211 100644 --- a/vortex-compressor/src/builtins/dict/mod.rs +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -10,5 +10,7 @@ mod string; pub use binary::BinaryDictScheme; pub use float::FloatDictScheme; +pub use float::dictionary_encode as float_dictionary_encode; pub use integer::IntDictScheme; +pub use integer::dictionary_encode as integer_dictionary_encode; pub use string::StringDictScheme; diff --git a/vortex-compressor/src/builtins/dict/string.rs b/vortex-compressor/src/builtins/dict/string.rs index 978050be970..72db72c2cc5 100644 --- a/vortex-compressor/src/builtins/dict/string.rs +++ b/vortex-compressor/src/builtins/dict/string.rs @@ -16,6 +16,7 @@ use vortex_array::arrays::dict::DictArrayExt; use vortex_array::arrays::dict::DictArraySlotsExt; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::builders::dict::dict_encode; +use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; @@ -79,12 +80,12 @@ impl Scheme for StringDictScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - let max_useful_distinct_count = stats.value_count() / 2; + let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); - // If more than 50% of the values are definitely distinct, skip dictionary encoding. - // Cardinality is approximate for larger counts, so keep borderline estimates alive long - // enough for sampling to decide. - if !stats.estimated_distinct_count_could_be_at_most(max_useful_distinct_count) { + // If > 50% of the values are distinct, skip dictionary scheme. + if estimated_distinct_values_count > stats.value_count() / 2 { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs index 499c2c6d6a7..d014f1a6cf3 100644 --- a/vortex-compressor/src/builtins/mod.rs +++ b/vortex-compressor/src/builtins/mod.rs @@ -16,6 +16,8 @@ pub use dict::BinaryDictScheme; pub use dict::FloatDictScheme; pub use dict::IntDictScheme; pub use dict::StringDictScheme; +pub use dict::float_dictionary_encode; +pub use dict::integer_dictionary_encode; mod constant; diff --git a/vortex-compressor/src/stats/bool.rs b/vortex-compressor/src/stats/bool.rs index 0a9931e240a..b9f2eec7b70 100644 --- a/vortex-compressor/src/stats/bool.rs +++ b/vortex-compressor/src/stats/bool.rs @@ -92,25 +92,19 @@ impl BoolStats { #[cfg(test)] mod tests { - use std::sync::LazyLock; - + use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::BoolArray; - use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_error::VortexResult; - use vortex_session::VortexSession; use super::BoolStats; - static SESSION: LazyLock = - LazyLock::new(|| VortexSession::empty().with::()); - #[test] fn test_all_true() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, true, true]), Validity::NonNullable, @@ -125,7 +119,7 @@ mod tests { #[test] fn test_all_false() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![false, false, false]), Validity::NonNullable, @@ -140,7 +134,7 @@ mod tests { #[test] fn test_mixed() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, false, true]), Validity::NonNullable, @@ -155,7 +149,7 @@ mod tests { #[test] fn test_with_nulls() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, false, true]), Validity::from_iter([true, false, true]), diff --git a/vortex-compressor/src/stats/cardinality.rs b/vortex-compressor/src/stats/cardinality.rs deleted file mode 100644 index d3e50c14f7a..00000000000 --- a/vortex-compressor/src/stats/cardinality.rs +++ /dev/null @@ -1,169 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Platform-portable cardinality estimator. -//! -//! On 64-bit targets this delegates to Cloudflare's -//! [`cardinality_estimator`](https://crates.io/crates/cardinality-estimator), which is exact up -//! to ~128 distinct values and then transitions to a HyperLogLog++ representation. That crate is -//! gated by `#[cfg(target_pointer_width = "64")]` because it stores its state in a tagged -//! `usize`, so it cannot compile on 32-bit platforms. To keep the compressor portable we fall -//! back to the pure-Rust [`hyperloglogplus`](https://crates.io/crates/hyperloglogplus) crate on -//! non-64-bit targets, using a HyperLogLog++ with sparse representation that gives comparable -//! quality within the standard HLL++ error bound (~1.6% at the default precision). - -use std::hash::Hash; - -/// Expected relative standard error of the default cardinality estimator. -/// -/// Cloudflare's default `P=12` HLL++ parameters document this as -/// `1.04 / sqrt(2^12) ≈ 1.625%`. -const DISTINCT_COUNT_STD_ERROR_NUMERATOR: usize = 65; -/// Denominator for [`DISTINCT_COUNT_STD_ERROR_NUMERATOR`]. -const DISTINCT_COUNT_STD_ERROR_DENOMINATOR: usize = 4_000; -/// Number of standard errors to tolerate when turning estimates into scheme-selection cutoffs. -const DISTINCT_COUNT_CONFIDENCE_SIGMAS: usize = 4; - -/// Approximate distinct-count estimator with a tiny, stable surface area. -/// -/// The estimator is exact for small cardinalities on 64-bit targets and approximate beyond. On -/// non-64-bit targets it is a HyperLogLog++ sketch throughout; collisions in the sparse -/// representation make small-cardinality estimates approximate as well, though the error stays -/// well within the standard HLL++ bound. -pub(crate) struct CardinalityEstimator { - /// Platform-selected backend (Cloudflare on 64-bit, HLL++ elsewhere). - inner: inner::Estimator, -} - -impl CardinalityEstimator { - /// Create a new estimator with the default precision. - #[inline] - pub(crate) fn new() -> Self { - Self { - inner: inner::Estimator::new(), - } - } - - /// Insert a hashable item into the estimator. - #[inline] - pub(crate) fn insert(&mut self, item: &T) { - self.inner.insert(item); - } - - /// Return the current cardinality estimate. - /// - /// Takes `&mut self` because the 32-bit fallback's `count` implementation mutates internal - /// caches; the 64-bit implementation is logically `&self` but is wrapped uniformly. - #[inline] - pub(crate) fn estimate(&mut self) -> usize { - self.inner.estimate() - } -} - -impl Default for CardinalityEstimator { - fn default() -> Self { - Self::new() - } -} - -/// Returns an absolute tolerance for comparing a cardinality estimate with `count`. -pub(crate) fn distinct_count_error_bound(count: usize) -> usize { - count - .saturating_mul(DISTINCT_COUNT_STD_ERROR_NUMERATOR) - .saturating_mul(DISTINCT_COUNT_CONFIDENCE_SIGMAS) - .div_ceil(DISTINCT_COUNT_STD_ERROR_DENOMINATOR) -} - -/// Returns true if `estimate` is still consistent with the true count being at most `count`. -pub(crate) fn estimate_could_be_at_most(estimate: usize, count: usize) -> bool { - estimate <= count || estimate - count <= distinct_count_error_bound(count) -} - -/// Backend implementations selected at compile time by pointer width. -#[cfg(target_pointer_width = "64")] -mod inner { - use std::hash::Hash; - - /// Thin wrapper around Cloudflare's tagged-pointer cardinality estimator. - pub(super) struct Estimator { - /// The Cloudflare estimator using its default `P=12, W=6` parameters. - inner: cardinality_estimator::CardinalityEstimator, - } - - impl Estimator { - /// Construct a fresh estimator at default precision. - #[inline] - pub(super) fn new() -> Self { - Self { - inner: cardinality_estimator::CardinalityEstimator::new(), - } - } - - /// Forward an insertion to the underlying estimator. - #[inline] - pub(super) fn insert(&mut self, item: &T) { - self.inner.insert(item); - } - - /// Forward to the underlying constant-time estimate. - #[inline] - pub(super) fn estimate(&mut self) -> usize { - self.inner.estimate() - } - } -} - -/// 32-bit fallback using the `hyperloglogplus` crate. -#[cfg(not(target_pointer_width = "64"))] -mod inner { - use std::hash::BuildHasherDefault; - use std::hash::Hash; - - use hyperloglogplus::HyperLogLog as _; - use hyperloglogplus::HyperLogLogPlus; - use rustc_hash::FxHasher; - use vortex_error::VortexExpect; - - /// HLL++ precision exponent: 2^12 = 4096 registers, matching Cloudflare's default (~1.6% error). - const PRECISION: u8 = 12; - - /// HyperLogLog++ sketch over `T`, hashed with a deterministic `FxHasher`. - pub(super) struct Estimator { - /// Backing HLL++ sketch. - inner: HyperLogLogPlus>, - } - - impl Estimator { - /// Construct a fresh estimator at the configured `PRECISION`. - #[inline] - pub(super) fn new() -> Self { - // `HyperLogLogPlus::new` only fails if precision is outside `[4, 18]`; 12 is a - // compile-time constant inside that range. - let inner = HyperLogLogPlus::new(PRECISION, BuildHasherDefault::default()) - .ok() - .vortex_expect("HyperLogLogPlus precision constant is in [4, 18]"); - Self { inner } - } - - /// Hash and absorb the item into the sketch. - #[inline] - pub(super) fn insert(&mut self, item: &T) { - self.inner.insert(item); - } - - /// Compute the current cardinality estimate. - #[inline] - pub(super) fn estimate(&mut self) -> usize { - let count = self.inner.count(); - // `count` returns a non-negative `f64`; round to the nearest integer and clamp to - // `usize` to match the 64-bit backend's return type. - #[expect( - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - reason = "HLL++ count is non-negative; truncation matches the 64-bit backend's usize semantics" - )] - let rounded = count.max(0.0).round() as usize; - rounded - } - } -} diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs index 3196b95eb12..6c7e5360938 100644 --- a/vortex-compressor/src/stats/float.rs +++ b/vortex-compressor/src/stats/float.rs @@ -4,10 +4,10 @@ //! Float compression statistics. use std::hash::Hash; -use std::marker::PhantomData; use itertools::Itertools; use num_traits::Float; +use rustc_hash::FxBuildHasher; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::NativeValue; @@ -19,21 +19,24 @@ use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_error::vortex_panic; use vortex_mask::AllOr; +use vortex_utils::aliases::hash_set::HashSet; use super::GenerateStatsOptions; -use super::cardinality::CardinalityEstimator; -use super::cardinality::estimate_could_be_at_most; /// Information about the distinct values in a float array. -/// -/// The distinct count is an estimate produced by Cloudflare's cardinality estimator, which is -/// exact for small cardinalities and approximate beyond that. #[derive(Debug, Clone)] pub struct DistinctInfo { - /// The estimated count of unique values. This _must_ be non-zero. - distinct_count: usize, - /// Phantom marker for the float element type. - _marker: PhantomData, + /// The set of distinct float values. + distinct_values: HashSet, FxBuildHasher>, + /// The count of unique values. This _must_ be non-zero. + distinct_count: u32, +} + +impl DistinctInfo { + /// Returns a reference to the distinct values set. + pub fn distinct_values(&self) -> &HashSet, FxBuildHasher> { + &self.distinct_values + } } /// Typed statistics for a specific float type. @@ -63,7 +66,7 @@ pub enum ErasedStats { impl ErasedStats { /// Get the count of distinct values, if we have computed it already. - fn distinct_count(&self) -> Option { + fn distinct_count(&self) -> Option { match self { ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count), ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count), @@ -91,11 +94,11 @@ impl_from_typed!(f64, ErasedStats::F64); #[derive(Debug, Clone)] pub struct FloatStats { /// Cache for `validity.false_count()`. - null_count: usize, + null_count: u32, /// Cache for `validity.true_count()`. - value_count: usize, + value_count: u32, /// The average run length. - average_run_length: usize, + average_run_length: u32, /// Type-erased typed statistics. erased: ErasedStats, } @@ -116,18 +119,9 @@ impl FloatStats { } /// Get the count of distinct values, if we have computed it already. - pub fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { self.erased.distinct_count() } - - /// Returns true if the true distinct count could plausibly be at most `count`. - pub fn estimated_distinct_count_could_be_at_most(&self, count: usize) -> bool { - let Some(distinct_count) = self.distinct_count() else { - return true; - }; - - estimate_could_be_at_most(distinct_count, count) - } } impl FloatStats { @@ -147,17 +141,17 @@ impl FloatStats { } /// Returns the number of null values. - pub fn null_count(&self) -> usize { + pub fn null_count(&self) -> u32 { self.null_count } /// Returns the number of non-null values. - pub fn value_count(&self) -> usize { + pub fn value_count(&self) -> u32 { self.value_count } /// Returns the average run length. - pub fn average_run_length(&self) -> usize { + pub fn average_run_length(&self) -> u32 { self.average_run_length } @@ -189,13 +183,13 @@ where if array.all_invalid(ctx)? { return Ok(FloatStats { - null_count: array.len(), + null_count: u32::try_from(array.len())?, value_count: 0, average_run_length: 0, erased: TypedStats { distinct: Some(DistinctInfo { + distinct_values: HashSet::with_capacity_and_hasher(0, FxBuildHasher), distinct_count: 0, - _marker: PhantomData, }), } .into(), @@ -208,9 +202,13 @@ where .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; let value_count = array.len() - null_count; - // Cloudflare's cardinality estimator gives us a bounded-memory approximation of the - // number of distinct values, replacing the previous exact `HashSet`. - let mut estimator: CardinalityEstimator> = CardinalityEstimator::new(); + // Keep a HashMap of T, then convert the keys into PValue afterward since value is + // so much more efficient to hash and search for. + let mut distinct_values = if count_distinct_values { + HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher) + } else { + HashSet::with_hasher(FxBuildHasher) + }; let validity = array .as_ref() @@ -229,7 +227,7 @@ where AllOr::All => { for value in first_valid_buff { if count_distinct_values { - estimator.insert(&NativeValue(value)); + distinct_values.insert(NativeValue(value)); } if value != prev { @@ -246,7 +244,7 @@ where { if valid { if count_distinct_values { - estimator.insert(&NativeValue(value)); + distinct_values.insert(NativeValue(value)); } if value != prev { @@ -258,9 +256,13 @@ where } } + let null_count = u32::try_from(null_count)?; + let value_count = u32::try_from(value_count)?; + let distinct = count_distinct_values.then(|| DistinctInfo { - distinct_count: estimator.estimate().max(1), - _marker: PhantomData, + distinct_count: u32::try_from(distinct_values.len()) + .vortex_expect("more than u32::MAX distinct values"), + distinct_values, }); Ok(FloatStats { @@ -273,27 +275,21 @@ where #[cfg(test)] mod tests { - use std::sync::LazyLock; - use vortex_array::IntoArray; + use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::PrimitiveArray; - use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_buffer::buffer; use vortex_error::VortexResult; - use vortex_session::VortexSession; use super::FloatStats; use crate::stats::GenerateStatsOptions; - static SESSION: LazyLock = - LazyLock::new(|| VortexSession::empty().with::()); - #[test] fn test_float_stats() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array(); let floats = floats.execute::(&mut ctx)?; @@ -314,7 +310,7 @@ mod tests { #[test] fn test_float_stats_leading_nulls() { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let floats = PrimitiveArray::new( buffer![0.0f32, 1.0f32, 2.0f32], Validity::from_iter([false, true, true]), diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index 90666cf5eab..e436992f4e5 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -6,6 +6,7 @@ use std::hash::Hash; use num_traits::PrimInt; +use rustc_hash::FxBuildHasher; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::NativeValue; @@ -19,28 +20,28 @@ use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_mask::AllOr; +use vortex_utils::aliases::hash_map::HashMap; use super::GenerateStatsOptions; -use super::cardinality::CardinalityEstimator; -use super::cardinality::distinct_count_error_bound; -use super::cardinality::estimate_could_be_at_most; /// Information about the distinct values in an integer array. -/// -/// The `distinct_count` is an estimate computed using Cloudflare's cardinality estimator, which -/// yields exact counts for small cardinalities (<= 128 for the default parameters) and a -/// HyperLogLog++ approximation for larger cardinalities. The most frequent value is tracked using -/// the Boyer-Moore majority candidate algorithm, so `most_frequent_value` and `top_frequency` are -/// only guaranteed to reflect the true majority element when some value accounts for more than -/// half of the non-null entries; otherwise they are treated as a best-effort estimate. #[derive(Debug, Clone)] pub struct DistinctInfo { - /// The estimated count of unique values. This _must_ be non-zero. - distinct_count: usize, - /// The most frequent value (Boyer-Moore majority candidate). + /// The unique values and their occurrences. + distinct_values: HashMap, u32, FxBuildHasher>, + /// The count of unique values. This _must_ be non-zero. + distinct_count: u32, + /// The most frequent value. most_frequent_value: T, - /// The exact number of times `most_frequent_value` occurs in the array. - top_frequency: usize, + /// The number of times the most frequent value occurs. + top_frequency: u32, +} + +impl DistinctInfo { + /// Returns a reference to the distinct values map. + pub fn distinct_values(&self) -> &HashMap, u32, FxBuildHasher> { + &self.distinct_values + } } /// Typed statistics for a specific integer type. @@ -63,12 +64,12 @@ impl TypedStats { impl TypedStats { /// Get the count of distinct values, if we have computed it already. - fn distinct_count(&self) -> Option { + fn distinct_count(&self) -> Option { Some(self.distinct.as_ref()?.distinct_count) } /// Get the most commonly occurring value and its count, if we have computed it already. - fn most_frequent_value_and_count(&self) -> Option<(&T, usize)> { + fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> { let distinct = self.distinct.as_ref()?; Some((&distinct.most_frequent_value, distinct.top_frequency)) } @@ -165,7 +166,7 @@ impl ErasedStats { } /// Get the count of distinct values, if we have computed it already. - pub fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { match &self { ErasedStats::U8(x) => x.distinct_count(), ErasedStats::U16(x) => x.distinct_count(), @@ -179,7 +180,7 @@ impl ErasedStats { } /// Get the most commonly occurring value and its count. - pub fn most_frequent_value_and_count(&self) -> Option<(PValue, usize)> { + pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { match &self { ErasedStats::U8(x) => { let (top_value, count) = x.most_frequent_value_and_count()?; @@ -241,11 +242,11 @@ impl_from_typed!(i64, ErasedStats::I64); #[derive(Clone, Debug)] pub struct IntegerStats { /// Cache for `validity.false_count()`. - null_count: usize, + null_count: u32, /// Cache for `validity.true_count()`. - value_count: usize, + value_count: u32, /// The average run length. - average_run_length: usize, + average_run_length: u32, /// Type-erased typed statistics. erased: ErasedStats, } @@ -263,42 +264,12 @@ impl IntegerStats { } /// Get the count of distinct values, if we have computed it already. - pub fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { self.erased.distinct_count() } - /// Returns true if the true distinct count could plausibly equal `count`. - /// - /// Callers pass an upper bound on the true distinct count (the array length), so an estimate - /// at or above `count` is pure estimator overshoot and cannot rule out "every value is - /// distinct" — we accept it. Below `count` we only rule it out when the estimate falls short by - /// more than the estimator's expected error, so estimator noise does not cause us to skip - /// genuinely all-distinct arrays such as arithmetic sequences. - pub fn estimated_distinct_count_could_equal(&self, count: usize) -> bool { - let Some(distinct_count) = self.distinct_count() else { - return true; - }; - - // The true distinct count never exceeds `count`, so an over-estimate reflects estimator - // noise and remains consistent with all values being distinct. - if distinct_count >= count { - return true; - } - - count - distinct_count <= distinct_count_error_bound(count) - } - - /// Returns true if the true distinct count could plausibly be at most `count`. - pub fn estimated_distinct_count_could_be_at_most(&self, count: usize) -> bool { - let Some(distinct_count) = self.distinct_count() else { - return true; - }; - - estimate_could_be_at_most(distinct_count, count) - } - /// Get the most commonly occurring value and its count, if we have computed it already. - pub fn most_frequent_value_and_count(&self) -> Option<(PValue, usize)> { + pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { self.erased.most_frequent_value_and_count() } } @@ -320,17 +291,17 @@ impl IntegerStats { } /// Returns the number of null values. - pub fn null_count(&self) -> usize { + pub fn null_count(&self) -> u32 { self.null_count } /// Returns the number of non-null values. - pub fn value_count(&self) -> usize { + pub fn value_count(&self) -> u32 { self.value_count } /// Returns the average run length. - pub fn average_run_length(&self) -> usize { + pub fn average_run_length(&self) -> u32 { self.average_run_length } @@ -368,13 +339,14 @@ where if array.all_invalid(ctx)? { return Ok(IntegerStats { - null_count: array.len(), + null_count: u32::try_from(array.len())?, value_count: 0, average_run_length: 0, erased: TypedStats { min: T::max_value(), max: T::min_value(), distinct: Some(DistinctInfo { + distinct_values: HashMap::with_capacity_and_hasher(0, FxBuildHasher), distinct_count: 0, most_frequent_value: T::zero(), top_frequency: 0, @@ -398,10 +370,12 @@ where let buffer = array.to_buffer::(); let head = buffer[head_idx]; - let mut loop_state = LoopState:: { - estimator: CardinalityEstimator::new(), - bm_candidate: head, - bm_votes: 0, + let mut loop_state = LoopState { + distinct_values: if count_distinct_values { + HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher) + } else { + HashMap::with_hasher(FxBuildHasher) + }, prev: head, runs: 1, }; @@ -476,28 +450,26 @@ where .vortex_expect("max should be computed"); let distinct = count_distinct_values.then(|| { - // The cardinality estimator is exact for small cardinalities and approximate beyond. - // We clamp to at least 1 because we are inside the non-empty/non-all-null branch. - let distinct_count = loop_state.estimator.estimate().max(1); - - // Count the Boyer-Moore majority candidate exactly via a second pass. If any value - // accounts for more than half of the non-null entries, this counts that value; otherwise - // the returned count is a best-effort estimate for whichever candidate survived. - let top_frequency = count_occurrences::( - buffer.as_slice(), - validity.bit_buffer(), - loop_state.bm_candidate, - ); + let (&top_value, &top_count) = loop_state + .distinct_values + .iter() + .max_by_key(|&(_, &count)| count) + .vortex_expect("we know this is non-empty"); DistinctInfo { - distinct_count, - most_frequent_value: loop_state.bm_candidate, - top_frequency, + distinct_count: u32::try_from(loop_state.distinct_values.len()) + .vortex_expect("there are more than `u32::MAX` distinct values"), + most_frequent_value: top_value.0, + top_frequency: top_count, + distinct_values: loop_state.distinct_values, } }); let typed = TypedStats { min, max, distinct }; + let null_count = u32::try_from(null_count)?; + let value_count = u32::try_from(value_count)?; + Ok(IntegerStats { null_count, value_count, @@ -507,57 +479,13 @@ where } /// Internal loop state for integer stats computation. -struct LoopState -where - T: IntegerPType, - NativeValue: Eq + Hash, -{ +struct LoopState { /// The previous value seen. prev: T, /// The run count. - runs: usize, - /// Cloudflare's cardinality estimator, used to approximate the number of distinct values - /// without materializing an exact hash map. - estimator: CardinalityEstimator>, - /// Boyer-Moore majority candidate; holds the current candidate for the most frequent value. - bm_candidate: T, - /// Boyer-Moore vote counter for `bm_candidate`. - bm_votes: usize, -} - -/// Updates the Boyer-Moore majority-vote state for a single value. -#[inline(always)] -fn boyer_moore_observe(state: &mut LoopState, value: T) -where - T: IntegerPType, - NativeValue: Eq + Hash, -{ - if state.bm_votes == 0 { - state.bm_candidate = value; - state.bm_votes = 1; - } else if value == state.bm_candidate { - state.bm_votes += 1; - } else { - state.bm_votes -= 1; - } -} - -/// Counts exact occurrences of `needle` in `buffer`, restricted to valid positions according to -/// `validity`. -fn count_occurrences( - buffer: &[T], - validity: AllOr<&BitBuffer>, - needle: T, -) -> usize { - match validity { - AllOr::All => buffer.iter().filter(|&&v| v == needle).count(), - AllOr::None => 0, - AllOr::Some(mask) => buffer - .iter() - .enumerate() - .filter(|&(idx, &v)| mask.value(idx) && v == needle) - .count(), - } + runs: u32, + /// The distinct values map. + distinct_values: HashMap, u32, FxBuildHasher>, } /// Inner loop for non-null chunks of 64 values. @@ -571,8 +499,7 @@ fn inner_loop_nonnull( { for &value in values { if count_distinct_values { - state.estimator.insert(&NativeValue(value)); - boyer_moore_observe(state, value); + *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; } if value != state.prev { @@ -595,8 +522,7 @@ fn inner_loop_nullable( for (idx, &value) in values.iter().enumerate() { if is_valid.value(idx) { if count_distinct_values { - state.estimator.insert(&NativeValue(value)); - boyer_moore_observe(state, value); + *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; } if value != state.prev { @@ -620,8 +546,7 @@ fn inner_loop_naive( for (idx, &value) in values.iter().enumerate() { if is_valid.value(idx) { if count_distinct_values { - state.estimator.insert(&NativeValue(value)); - boyer_moore_observe(state, value); + *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; } if value != state.prev { @@ -636,18 +561,16 @@ fn inner_loop_naive( mod tests { use std::iter; use std::sync::LazyLock; - + use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::PrimitiveArray; - use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_buffer::Buffer; use vortex_buffer::buffer; use vortex_error::VortexResult; use vortex_session::VortexSession; - use super::IntegerStats; use super::typed_int_stats; @@ -655,7 +578,7 @@ mod tests { #[test] fn test_naive_count_distinct_values() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable); let stats = typed_int_stats::(&array, true, &mut ctx)?; assert_eq!(stats.distinct_count().unwrap(), 2); @@ -664,7 +587,7 @@ mod tests { #[test] fn test_naive_count_distinct_values_nullable() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = PrimitiveArray::new( buffer![217u8, 0], Validity::from(BitBuffer::from(vec![true, false])), @@ -676,7 +599,7 @@ mod tests { #[test] fn test_count_distinct_values() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = PrimitiveArray::new((0..128u8).collect::>(), Validity::NonNullable); let stats = typed_int_stats::(&array, true, &mut ctx)?; assert_eq!(stats.distinct_count().unwrap(), 128); @@ -685,7 +608,7 @@ mod tests { #[test] fn test_count_distinct_values_nullable() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let array = PrimitiveArray::new( (0..128u8).collect::>(), Validity::from(BitBuffer::from_iter( @@ -699,7 +622,7 @@ mod tests { #[test] fn test_integer_stats_leading_nulls() { - let mut ctx = SESSION.create_execution_ctx(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true])); let stats = IntegerStats::generate_opts( @@ -715,77 +638,4 @@ mod tests { assert_eq!(stats.average_run_length, 1); assert_eq!(stats.distinct_count().unwrap(), 2); } - - #[test] - fn test_most_frequent_value_dominates() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); - // A value that appears in 95% of the array must be recovered exactly by the - // Boyer-Moore tracking plus second-pass count. - let top = -1i32; - let mut data: Vec = vec![top; 950]; - data.extend(0..50i32); - let array = PrimitiveArray::new(Buffer::copy_from(&data), Validity::NonNullable); - let stats = typed_int_stats::(&array, true, &mut ctx)?; - let (top_value, top_count) = stats - .erased() - .most_frequent_value_and_count() - .expect("distinct info must be present"); - assert_eq!(top_value, top.into()); - assert_eq!(top_count, 950); - Ok(()) - } - - #[test] - fn test_cardinality_estimate_large_unique() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); - // For 1024 distinct values the estimator falls back to HyperLogLog++; verify the - // estimate is within the expected error bound (~1.6% for the default P/W). - let array = - PrimitiveArray::new((0..1024u32).collect::>(), Validity::NonNullable); - let stats = typed_int_stats::(&array, true, &mut ctx)?; - let estimated = stats.distinct_count().unwrap(); - let error_ratio = (estimated as f64 - 1024.0).abs() / 1024.0; - assert!( - error_ratio < 0.05, - "estimator error {error_ratio} exceeds 5% for 1024 distinct values" - ); - Ok(()) - } - - #[test] - fn test_estimated_distinct_count_could_equal() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); - - let unique = - PrimitiveArray::new((0..1024u32).collect::>(), Validity::NonNullable); - let unique_stats = typed_int_stats::(&unique, true, &mut ctx)?; - assert!(unique_stats.estimated_distinct_count_could_equal(1024)); - - let low_cardinality = PrimitiveArray::new( - (0..1024u32).map(|value| value % 8).collect::>(), - Validity::NonNullable, - ); - let low_cardinality_stats = typed_int_stats::(&low_cardinality, true, &mut ctx)?; - assert!(!low_cardinality_stats.estimated_distinct_count_could_equal(1024)); - - Ok(()) - } - - #[test] - fn test_all_distinct_could_equal_despite_estimator_drift() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); - - // Regression: an arithmetic sequence is all-distinct, but the HLL++ estimate for ~10k - // values drifts a few percent off the exact count (here it overshoots to ~10457). The - // sequence scheme gates on `estimated_distinct_count_could_equal`, so this must remain true - // or the array is never sequence-encoded. - let range = PrimitiveArray::new( - (0..10_000u32).collect::>(), - Validity::NonNullable, - ); - let range_stats = typed_int_stats::(&range, true, &mut ctx)?; - assert!(range_stats.estimated_distinct_count_could_equal(10_000)); - - Ok(()) - } } diff --git a/vortex-compressor/src/stats/mod.rs b/vortex-compressor/src/stats/mod.rs index 1449846215a..8f6e7720837 100644 --- a/vortex-compressor/src/stats/mod.rs +++ b/vortex-compressor/src/stats/mod.rs @@ -5,7 +5,6 @@ mod bool; mod cache; -mod cardinality; mod float; mod integer; mod options; diff --git a/vortex-compressor/src/stats/varbinview.rs b/vortex-compressor/src/stats/varbinview.rs index 47c5b8c12d2..486d431e684 100644 --- a/vortex-compressor/src/stats/varbinview.rs +++ b/vortex-compressor/src/stats/varbinview.rs @@ -5,83 +5,42 @@ use vortex_array::ExecutionCtx; use vortex_array::arrays::VarBinViewArray; -use vortex_array::arrays::varbinview::BinaryView; -use vortex_buffer::ByteBuffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_err; -use vortex_mask::AllOr; +use vortex_utils::aliases::hash_set::HashSet; use super::GenerateStatsOptions; -use super::cardinality::CardinalityEstimator; -use super::cardinality::estimate_could_be_at_most; /// Array of variable-length byte/string values, and relevant stats for compression. #[derive(Clone, Debug)] pub struct StringStats { /// The estimated number of distinct values, or `None` if not computed. /// This _must_ be non-zero. - estimated_distinct_count: Option, + estimated_distinct_count: Option, /// The number of non-null values. - value_count: usize, + value_count: u32, /// The number of null values. - null_count: usize, + null_count: u32, } -/// Estimate the number of distinct strings in the var bin view array using Cloudflare's -/// cardinality estimator. -/// -/// Every non-null value is hashed in full, so the estimate's only error is the estimator's own: -/// it is exact for small cardinalities and transitions to HyperLogLog++ for larger ones. Null -/// entries are skipped, matching the integer and float distinct-count stats. -fn estimate_distinct_count( - strings: &VarBinViewArray, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let views = strings.views(); - let validity = strings - .as_ref() - .validity()? - .execute_mask(strings.len(), ctx)?; - let mut estimator: CardinalityEstimator<[u8]> = CardinalityEstimator::new(); - let buffers = strings - .data_buffers() - .iter() - .map(|b| b.as_host()) - .collect::>(); - - match validity.bit_buffer() { - AllOr::All => { - for view in views { - estimator.insert(view_bytes(&buffers, view)); - } - } - // Every value is null, so there is nothing to count. - AllOr::None => {} - AllOr::Some(is_valid) => { - for (idx, view) in views.iter().enumerate() { - if is_valid.value(idx) { - estimator.insert(view_bytes(&buffers, view)); - } - } - } - } - - Ok(estimator.estimate()) -} - -/// Returns the full bytes backing a single view, reading from the array's data buffers when the -/// value is not inlined. -/// -/// Only call this for non-null positions: a null slot's view may hold arbitrary bytes whose buffer -/// index and offset are not safe to dereference. -fn view_bytes<'a>(buffers: &[&'a ByteBuffer], view: &'a BinaryView) -> &'a [u8] { - if view.is_inlined() { - view.as_inlined().value() - } else { - let r = view.as_view(); - &buffers[r.buffer_index as usize][r.as_range()] - } +/// Estimate the number of distinct values in the var bin view array. +fn estimate_distinct_count(varbinview: &VarBinViewArray) -> VortexResult { + let views = varbinview.views(); + // Iterate the views. Two values which are equal must have the same first 8-bytes. + // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all + // share a 4-byte prefix and have the same length. + let mut distinct = HashSet::with_capacity(views.len() / 2); + views.iter().for_each(|&view| { + #[expect( + clippy::cast_possible_truncation, + reason = "approximate uniqueness with view prefix" + )] + let len_and_prefix = view.as_u128() as u64; + distinct.insert(len_and_prefix); + }); + + Ok(u32::try_from(distinct.len())?) } impl StringStats { @@ -98,13 +57,13 @@ impl StringStats { let value_count = input.len() - null_count; let estimated_distinct_count = opts .count_distinct_values - .then(|| estimate_distinct_count(input, ctx)) + .then(|| estimate_distinct_count(input)) .transpose()?; Ok(Self { + value_count: u32::try_from(value_count)?, + null_count: u32::try_from(null_count)?, estimated_distinct_count, - value_count, - null_count, }) } } @@ -127,28 +86,18 @@ impl StringStats { /// Returns the estimated number of distinct values, or `None` if not computed. /// - /// The estimate is exact for small cardinalities and an approximation (which may be slightly - /// above or below the true count) for larger ones. - pub fn estimated_distinct_count(&self) -> Option { + /// This estimation is always going to be less than or equal to the actual distinct count. + pub fn estimated_distinct_count(&self) -> Option { self.estimated_distinct_count } - /// Returns true if the true distinct count could plausibly be at most `count`. - pub fn estimated_distinct_count_could_be_at_most(&self, count: usize) -> bool { - let Some(distinct_count) = self.estimated_distinct_count else { - return true; - }; - - estimate_could_be_at_most(distinct_count, count) - } - /// Returns the number of non-null values. - pub fn value_count(&self) -> usize { + pub fn value_count(&self) -> u32 { self.value_count } /// Returns the number of null values. - pub fn null_count(&self) -> usize { + pub fn null_count(&self) -> u32 { self.null_count } } From dedff72e32da1815fd5a677b801f9a6a265c2c5d Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 9 Jun 2026 15:50:01 +0100 Subject: [PATCH 11/16] more Signed-off-by: Robert Kruszewski --- vortex-layout/src/layouts/dict/writer.rs | 45 +++++++++++++++++------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/vortex-layout/src/layouts/dict/writer.rs b/vortex-layout/src/layouts/dict/writer.rs index f83c426b55b..1a376452984 100644 --- a/vortex-layout/src/layouts/dict/writer.rs +++ b/vortex-layout/src/layouts/dict/writer.rs @@ -20,8 +20,8 @@ use futures::stream::once; use futures::try_join; use vortex_array::ArrayContext; use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::Dict; use vortex_array::builders::dict::DictConstraints; @@ -170,7 +170,11 @@ impl LayoutStrategy for DictStrategy { // 1. from a chunk stream, create a stream that yields codes // followed by a single value chunk when dict constraints are hit. // (a1, a2) -> (code(c1), code(c2), values(v1), code(c3), ...) - let dict_stream = dict_encode_stream(stream, options.constraints.into()); + let dict_stream = dict_encode_stream( + stream, + options.constraints.into(), + session.create_execution_ctx(), + ); // Wrap up the dict stream to yield pairs of (codes_stream, values_future). // Each of these pairs becomes a child dict layout. @@ -263,6 +267,7 @@ type DictionaryStream = BoxStream<'static, VortexResult>; fn dict_encode_stream( input: SendableSequentialStream, constraints: DictConstraints, + mut exec_ctx: ExecutionCtx, ) -> DictionaryStream { Box::pin(try_stream! { let mut state = DictStreamState { @@ -282,7 +287,7 @@ fn dict_encode_stream( match input.as_mut().peek().await { Some(_) => { let mut labeler = DictChunkLabeler::new(sequence_id); - let chunks = state.encode(&mut labeler, chunk)?; + let chunks = state.encode(&mut labeler, chunk, &mut exec_ctx)?; drop(labeler); for dict_chunk in chunks { yield dict_chunk; @@ -291,7 +296,7 @@ fn dict_encode_stream( None => { // this is the last element, encode and drain chunks let mut labeler = DictChunkLabeler::new(sequence_id); - let encoded = state.encode(&mut labeler, chunk)?; + let encoded = state.encode(&mut labeler, chunk, &mut exec_ctx)?; let drained = state.drain_values(&mut labeler); drop(labeler); for dict_chunk in encoded.into_iter().chain(drained.into_iter()) { @@ -313,12 +318,13 @@ impl DictStreamState { &mut self, labeler: &mut DictChunkLabeler, chunk: ArrayRef, + exec_ctx: &mut ExecutionCtx, ) -> VortexResult> { let mut res = Vec::new(); let mut to_be_encoded = Some(chunk); while let Some(remaining) = to_be_encoded.take() { match self.encoder.take() { - None => match start_encoding(&self.constraints, &remaining)? { + None => match start_encoding(&self.constraints, &remaining, exec_ctx)? { EncodingState::Continue((encoder, encoded)) => { let ptype = encoder.codes_ptype(); res.push(labeler.codes(encoded, ptype)); @@ -335,7 +341,7 @@ impl DictStreamState { }, Some(encoder) => { let ptype = encoder.codes_ptype(); - match encode_chunk(encoder, &remaining)? { + match encode_chunk(encoder, &remaining, exec_ctx)? { EncodingState::Continue((encoder, encoded)) => { res.push(labeler.codes(encoded, ptype)); self.encoder = Some(encoder); @@ -550,18 +556,21 @@ enum EncodingState { Done((ArrayRef, ArrayRef, ArrayRef)), } -fn start_encoding(constraints: &DictConstraints, chunk: &ArrayRef) -> VortexResult { +fn start_encoding( + constraints: &DictConstraints, + chunk: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult { let encoder = dict_encoder(chunk, constraints); - encode_chunk(encoder, chunk) + encode_chunk(encoder, chunk, ctx) } fn encode_chunk( mut encoder: Box, chunk: &ArrayRef, + ctx: &mut ExecutionCtx, ) -> VortexResult { - let encoded = encoder - .encode(chunk, &mut LEGACY_SESSION.create_execution_ctx())? - .into_array(); + let encoded = encoder.encode(chunk, ctx)?.into_array(); match remainder(chunk, encoded.len())? { None => Ok(EncodingState::Continue((encoder, encoded))), Some(unencoded) => Ok(EncodingState::Done((encoder.reset(), encoded, unencoded))), @@ -578,13 +587,18 @@ fn remainder(array: &ArrayRef, encoded_len: usize) -> VortexResult = + LazyLock::new(|| VortexSession::empty().with::()); + /// Regression test for a bug where the codes stream dtype was hardcoded to U16 instead of /// using the actual codes dtype from the array. When `max_len <= 255`, the dict encoder /// produces U8 codes, but the stream was incorrectly typed as U16, causing a dtype mismatch @@ -617,7 +634,8 @@ mod tests { .sendable(); // Encode into dict chunks. - let dict_stream = dict_encode_stream(input_stream, constraints); + let dict_stream = + dict_encode_stream(input_stream, constraints, SESSION.create_execution_ctx()); // Transform into codes/values streams. let mut transformer = DictionaryTransformer::new(dict_stream); @@ -659,7 +677,8 @@ mod tests { .sendable(); // Encode into dict chunks. - let dict_stream = dict_encode_stream(input_stream, constraints); + let dict_stream = + dict_encode_stream(input_stream, constraints, SESSION.create_execution_ctx()); // Transform into codes/values streams. let mut transformer = DictionaryTransformer::new(dict_stream); From 7179a3980c2707356a6063c1801a35e41e1f57d9 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 9 Jun 2026 16:28:41 +0100 Subject: [PATCH 12/16] format Signed-off-by: Robert Kruszewski --- vortex-array/benches/dict_compare.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/vortex-array/benches/dict_compare.rs b/vortex-array/benches/dict_compare.rs index 5ef450173f5..29e1516e2b6 100644 --- a/vortex-array/benches/dict_compare.rs +++ b/vortex-array/benches/dict_compare.rs @@ -48,9 +48,6 @@ const LENGTH_AND_UNIQUE_VALUES: &[(usize, usize)] = &[ (100_000, 2048), ]; -static SESSION: LazyLock = - LazyLock::new(|| VortexSession::empty().with::()); - #[divan::bench(args = LENGTH_AND_UNIQUE_VALUES)] fn bench_compare_primitive(bencher: divan::Bencher, (len, uniqueness): (usize, usize)) { let primitive_arr = gen_primitive_for_dict::(len, uniqueness); From 067bf73b127e5d70f5f61c999ee022abef6c3dfe Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Fri, 19 Jun 2026 15:58:39 +0100 Subject: [PATCH 13/16] imports Signed-off-by: Robert Kruszewski --- vortex-compressor/benches/dict_encode.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vortex-compressor/benches/dict_encode.rs b/vortex-compressor/benches/dict_encode.rs index c3d3d2e3617..2c4e24108a7 100644 --- a/vortex-compressor/benches/dict_encode.rs +++ b/vortex-compressor/benches/dict_encode.rs @@ -3,9 +3,10 @@ #![expect(clippy::unwrap_used)] +use std::sync::LazyLock; + use divan::Bencher; use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; From b4007b2aed3a5b9158ae5c2f8351d566a50acb7d Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 22 Jun 2026 15:57:38 +0100 Subject: [PATCH 14/16] comments Signed-off-by: Robert Kruszewski --- vortex-array/src/builders/dict/bytes.rs | 4 ++++ vortex-array/src/builders/dict/primitive.rs | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/vortex-array/src/builders/dict/bytes.rs b/vortex-array/src/builders/dict/bytes.rs index d2e5455d7b7..00f14388aa2 100644 --- a/vortex-array/src/builders/dict/bytes.rs +++ b/vortex-array/src/builders/dict/bytes.rs @@ -89,6 +89,8 @@ impl BytesDictBuilder { } } + /// Returns `None` when assigning a code would exceed the dictionary constraints, + /// and callers should stop encoding after the current prefix. fn encode_value(&mut self, lookup: &mut HashTable, val: &[u8]) -> Option { match lookup.entry( self.hasher.hash_one(val), @@ -131,6 +133,8 @@ impl BytesDictBuilder { } } + /// Returns `None` when assigning the null code would exceed the dictionary constraints, + /// and callers should stop encoding after the current prefix. fn encode_null(&mut self) -> Option { if let Some(code) = self.null_code.get() { return Some(*code); diff --git a/vortex-array/src/builders/dict/primitive.rs b/vortex-array/src/builders/dict/primitive.rs index e8d36d128f2..dea2932367b 100644 --- a/vortex-array/src/builders/dict/primitive.rs +++ b/vortex-array/src/builders/dict/primitive.rs @@ -82,6 +82,8 @@ where } } + /// Returns `None` when assigning a code would exceed the dictionary constraints, + /// and callers should stop encoding after the current prefix. fn encode_value(&mut self, v: T) -> Option { match self.lookup.entry(NativeValue(v)) { Entry::Occupied(o) => Some(*o.get()), @@ -99,6 +101,8 @@ where } } + /// Returns `None` when assigning the null code would exceed the dictionary constraints, + /// and callers should stop encoding after the current prefix. fn encode_null(&mut self) -> Option { if let Some(code) = self.null_code.get() { return Some(*code); From a74bedbe1356dce187b93834a501b31d5a27b06e Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 23 Jun 2026 17:45:38 +0100 Subject: [PATCH 15/16] less Signed-off-by: Robert Kruszewski --- Cargo.lock | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f36678b069c..c7c50f5047e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3592,21 +3592,6 @@ dependencies = [ ] [[package]] -<<<<<<< HEAD -name = "enum_dispatch" -version = "0.3.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" -dependencies = [ - "once_cell", - "proc-macro2", - "quote", - "syn 2.0.118", -] - -[[package]] -======= ->>>>>>> 1db45e55e (less) name = "env_filter" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" From 0e100634a2de59d3a7fe41934d4b861ebb62b9d1 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 23 Jun 2026 17:48:10 +0100 Subject: [PATCH 16/16] fixes Signed-off-by: Robert Kruszewski --- vortex-array/src/arrays/dict/compute/cast.rs | 12 +++++------- vortex-compressor/src/stats/bool.rs | 9 ++++----- vortex-compressor/src/stats/float.rs | 5 ++--- vortex-compressor/src/stats/integer.rs | 17 +++++++---------- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/vortex-array/src/arrays/dict/compute/cast.rs b/vortex-array/src/arrays/dict/compute/cast.rs index 215ebddaab6..56a433627a2 100644 --- a/vortex-array/src/arrays/dict/compute/cast.rs +++ b/vortex-array/src/arrays/dict/compute/cast.rs @@ -49,6 +49,7 @@ mod tests { use std::sync::LazyLock; use rstest::rstest; + use vortex_buffer::BitBuffer; use vortex_buffer::buffer; use vortex_session::VortexSession; @@ -56,6 +57,7 @@ mod tests { use crate::IntoArray; use crate::VortexSessionExecute; use crate::arrays::Dict; + use crate::arrays::DictArray; use crate::arrays::PrimitiveArray; use crate::arrays::dict::DictArraySlotsExt; use crate::assert_arrays_eq; @@ -65,6 +67,7 @@ mod tests { use crate::dtype::DType; use crate::dtype::Nullability; use crate::dtype::PType; + use crate::validity::Validity; static SESSION: LazyLock = LazyLock::new(crate::array_session); @@ -192,7 +195,7 @@ mod tests { // Codes: [0, 2, 0] (only reference indices 0 and 2, never 1) let values = PrimitiveArray::new( buffer![1.0f64, 0.0f64, 3.0f64], - Validity::from(vortex_buffer::BitBuffer::from(vec![true, false, true])), + Validity::from(BitBuffer::from(vec![true, false, true])), ) .into_array(); let codes = buffer![0u32, 2, 0].into_array(); @@ -217,11 +220,6 @@ mod tests { casted.dtype(), &DType::Primitive(PType::F64, Nullability::NonNullable) ); - let casted_prim = casted.execute::(ctx).unwrap(); - assert_arrays_eq!( - casted_prim, - PrimitiveArray::from_iter([1.0f64, 3.0, 1.0]), - ctx - ); + assert_arrays_eq!(casted, PrimitiveArray::from_iter([1.0f64, 3.0, 1.0]), ctx); } } diff --git a/vortex-compressor/src/stats/bool.rs b/vortex-compressor/src/stats/bool.rs index b9f2eec7b70..fbbc4001b90 100644 --- a/vortex-compressor/src/stats/bool.rs +++ b/vortex-compressor/src/stats/bool.rs @@ -92,7 +92,6 @@ impl BoolStats { #[cfg(test)] mod tests { - use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::BoolArray; @@ -104,7 +103,7 @@ mod tests { #[test] fn test_all_true() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, true, true]), Validity::NonNullable, @@ -119,7 +118,7 @@ mod tests { #[test] fn test_all_false() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![false, false, false]), Validity::NonNullable, @@ -134,7 +133,7 @@ mod tests { #[test] fn test_mixed() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, false, true]), Validity::NonNullable, @@ -149,7 +148,7 @@ mod tests { #[test] fn test_with_nulls() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = BoolArray::new( BitBuffer::from(vec![true, false, true]), Validity::from_iter([true, false, true]), diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs index 6c7e5360938..c505993e9e5 100644 --- a/vortex-compressor/src/stats/float.rs +++ b/vortex-compressor/src/stats/float.rs @@ -276,7 +276,6 @@ where #[cfg(test)] mod tests { use vortex_array::IntoArray; - use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::PrimitiveArray; @@ -289,7 +288,7 @@ mod tests { #[test] fn test_float_stats() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array(); let floats = floats.execute::(&mut ctx)?; @@ -310,7 +309,7 @@ mod tests { #[test] fn test_float_stats_leading_nulls() { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let floats = PrimitiveArray::new( buffer![0.0f32, 1.0f32, 2.0f32], Validity::from_iter([false, true, true]), diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index e436992f4e5..a399c1060f9 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -560,8 +560,7 @@ fn inner_loop_naive( #[cfg(test)] mod tests { use std::iter; - use std::sync::LazyLock; - use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; use vortex_array::array_session; use vortex_array::arrays::PrimitiveArray; @@ -570,15 +569,13 @@ mod tests { use vortex_buffer::Buffer; use vortex_buffer::buffer; use vortex_error::VortexResult; - use vortex_session::VortexSession; + use super::IntegerStats; use super::typed_int_stats; - static SESSION: LazyLock = LazyLock::new(array_session); - #[test] fn test_naive_count_distinct_values() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable); let stats = typed_int_stats::(&array, true, &mut ctx)?; assert_eq!(stats.distinct_count().unwrap(), 2); @@ -587,7 +584,7 @@ mod tests { #[test] fn test_naive_count_distinct_values_nullable() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = PrimitiveArray::new( buffer![217u8, 0], Validity::from(BitBuffer::from(vec![true, false])), @@ -599,7 +596,7 @@ mod tests { #[test] fn test_count_distinct_values() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = PrimitiveArray::new((0..128u8).collect::>(), Validity::NonNullable); let stats = typed_int_stats::(&array, true, &mut ctx)?; assert_eq!(stats.distinct_count().unwrap(), 128); @@ -608,7 +605,7 @@ mod tests { #[test] fn test_count_distinct_values_nullable() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let array = PrimitiveArray::new( (0..128u8).collect::>(), Validity::from(BitBuffer::from_iter( @@ -622,7 +619,7 @@ mod tests { #[test] fn test_integer_stats_leading_nulls() { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = array_session().create_execution_ctx(); let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true])); let stats = IntegerStats::generate_opts(