refactor(parquet): drop ValueCountStrategy, count values via def levels

adriangb · claude · adriangb · commit beb5fc25f0c1 · 2026-05-15T15:47:31.000Z
`ValueCountStrategy` was a 3-way precomputed enum (`AllPresent` / `Sorted`
/ `DefLevelScan`) for answering "how many of this chunk's levels carry a
value". `LevelDataRef::value_count` already answers that correctly for
every column shape — `Absent`/`Uniform` def levels resolve in O(1), and
the O(n) scan only runs for genuinely materialized (nullable/nested) def
levels, on the variable-width slow path the chunker is already on.

The `Sorted` variant — `partition_point` of leaf-value indices against a
level offset — was only ever valid for flat columns; for nested columns
those indices live in a different coordinate space, which is what made
`vals_in_chunk` drift and spuriously trigger granular sub-batching
(`list_primitive_non_null` regression). Deleting the enum removes that
bug class structurally rather than guarding against it.

Net effect: the chunker module drops from ~320 to ~173 lines, the
`'a` lifetime and two parameters disappear from the chunker API, and
`ByteBudgetChunker` just stores `max_def_level`. `pick_sub_batch_size`
goes back to a plain `#[inline]` (the `#[inline(always)]` was added
chasing a `string_dictionary` swing later confirmed to be code-layout
noise, not an inlining effect). Perf-neutral — `value_count` vs the old
`partition_point` is negligible and only on the post-dict-spill path.

`LevelDataRef::value_count` gains a unit test as the now load-bearing
value-counting primitive.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/parquet/src/column/writer/byte_budget_chunker.rs b/parquet/src/column/writer/byte_budget_chunker.rs
@@ -21,49 +21,30 @@
 //! The parquet column writer checks the data page byte limit only *after*
 //! each mini-batch finishes writing. Mini-batches are sized in rows
 //! (`write_batch_size`, default 1024), so for BYTE_ARRAY columns whose
-//! values are large (e.g. multi-MiB blobs), a single mini-batch can buffer
+//! values are large (e.g. multi-MiB blobs) a single mini-batch can buffer
 //! GiB into one page before the limit is consulted.
 //!
 //! This module isolates the per-chunk decision that prevents that: given a
 //! chunk's level data and the input values, pick the largest `sub_batch_size`
-//! such that one mini-batch will fit in one page byte budget. For the
-//! overwhelmingly common case (small values), the answer is just
-//! `chunk_size` and the decision is O(1) on the column type. Only when the
-//! input might overflow does the chunker consult the encoder's byte
-//! estimate.
+//! such that one mini-batch fits in one page byte budget. For the
+//! overwhelmingly common case (small or fixed-width values) the answer is
+//! just `chunk_size` and the decision is O(1) on the column type — only
+//! when the input might overflow does the chunker consult the encoder's
+//! byte estimate.
 
 use crate::basic::Type;
 use crate::column::writer::LevelDataRef;
 use crate::column::writer::encoder::ColumnValueEncoder;
 use crate::file::properties::WriterProperties;
 use crate::schema::types::ColumnDescriptor;
 
-/// Strategy for counting how many values fall in a chunk's level range.
-/// Computed once per `write_batch_internal` call rather than per chunk so
-/// `partition_point` and `LevelDataRef::value_count` don't run when their
-/// answer is statically known to be `chunk_size`.
-#[derive(Clone, Copy)]
-pub(crate) enum ValueCountStrategy<'a> {
-    /// Every level corresponds to a non-null value, so the answer is
-    /// always `chunk_size`. Either the column has `max_def_level == 0`
-    /// or the arrow caller's `non_null_indices.len() == num_levels`.
-    AllPresent,
-    /// Flat (`max_rep_level == 0`) arrow nullable path: `non_null_indices`
-    /// hold row positions, which coincide with level offsets, so
-    /// `partition_point` over the chunk's level range counts values
-    /// directly. O(log n) per chunk.
-    Sorted(&'a [usize]),
-    /// Scan the chunk's def-level slice for entries matching `max_def`.
-    /// O(n) per chunk. Used for the non-arrow nullable path and for
-    /// repeated/nested columns, where `value_indices` index into the
-    /// decoupled leaf values array rather than the level stream.
-    DefLevelScan(i16),
-}
-
-/// Per-column-open chunker that picks byte-budget-aware mini-batch sizes.
+/// Picks byte-budget-aware mini-batch sizes for one column.
 pub(crate) struct ByteBudgetChunker {
     /// Configured data page byte limit for the column.
     page_byte_limit: usize,
+    /// Max definition level of the column; a level equal to this marks a
+    /// present (non-null) leaf value. Used to count values per chunk.
+    max_def_level: i16,
     /// `true` when no chunk of `base_batch_size` values can ever overflow
     /// `page_byte_limit` regardless of input. Set once at column open from
     /// the physical type's known per-value byte size; lets the per-chunk
@@ -93,142 +74,76 @@ impl ByteBudgetChunker {
             .unwrap_or(false);
         Self {
             page_byte_limit,
+            max_def_level: descr.max_def_level(),
             static_always_fits,
         }
     }
 
-    /// Pick the cheapest strategy for `vals_in_chunk` queries for this
-    /// `write_batch_internal` call. Computed once and reused per chunk so
-    /// we don't repeat the check on every iteration.
-    #[inline]
-    pub(crate) fn value_count_strategy<'a>(
-        descr: &ColumnDescriptor,
-        value_indices: Option<&'a [usize]>,
-        num_levels: usize,
-    ) -> ValueCountStrategy<'a> {
-        match value_indices {
-            // Arrow path. If every level has a non-null value, the gather
-            // index is the trivial `0..num_levels` and we don't need to
-            // walk it per chunk — `vals_in_chunk == chunk_size` by
-            // construction.
-            Some(idx) if idx.len() == num_levels => ValueCountStrategy::AllPresent,
-            // Repeated/nested arrow columns: `value_indices` index into the
-            // leaf values array, which is decoupled from the rep/def level
-            // stream. A `partition_point` of those indices against a level
-            // offset is meaningless — it makes `vals_in_chunk` drift away
-            // from the true per-chunk value count (it grows without bound
-            // as empty-list / sub-`max_def` levels accumulate, eventually
-            // forcing spurious granular sub-batching). Count via def levels
-            // instead. The `Sorted` fast path is only valid for flat
-            // columns, where `non_null_indices` are row positions that
-            // coincide with level offsets.
-            Some(_) if descr.max_rep_level() > 0 => {
-                ValueCountStrategy::DefLevelScan(descr.max_def_level())
-            }
-            Some(idx) => ValueCountStrategy::Sorted(idx),
-            // Non-arrow path. `max_def_level == 0` means the column has
-            // no nullability, so again `vals_in_chunk == chunk_size`.
-            None if descr.max_def_level() == 0 => ValueCountStrategy::AllPresent,
-            None => ValueCountStrategy::DefLevelScan(descr.max_def_level()),
-        }
-    }
-
-    /// Decide how many levels at the start of `chunk_def` belong in one
-    /// mini-batch.
-    ///
-    /// Returns `chunk_size` when the whole chunk fits in one page byte
-    /// budget. A smaller number triggers granular sub-batching in
-    /// `write_batch_internal`'s `write_granular_chunk` arm.
+    /// Decide how many levels at the start of a chunk belong in one
+    /// mini-batch. Returns `chunk_size` when the whole chunk fits in one
+    /// page byte budget; a smaller value triggers granular sub-batching in
+    /// `write_batch_internal`.
     ///
-    /// Bypasses:
-    /// - When `static_always_fits` is true (fixed-width type with a
-    ///   safe `base_batch_size`), return `chunk_size`.
-    /// - When the encoder is currently dictionary-encoding,
-    ///   `estimated_value_bytes` would return plain-encoded bytes while
-    ///   the actual page only stores small RLE indices, so the budget
-    ///   would shrink pages spuriously. Return `chunk_size` and let
-    ///   dictionary fallback bound dict-encoded pages independently.
-    /// - When `chunk_size == 0`, there's nothing to size.
+    /// Returns `chunk_size` immediately (no value inspection) when:
+    /// - the column is a fixed-width type that statically cannot overflow
+    ///   (`static_always_fits`);
+    /// - the encoder is currently dictionary-encoding — a dict-encoded data
+    ///   page only stores small RLE indices, so a plain-encoded byte
+    ///   estimate would shrink pages spuriously; dictionary fallback bounds
+    ///   those pages independently;
+    /// - the chunk is empty.
     ///
-    /// Hot path: when one of the bypass conditions fires this returns
-    /// `chunk_size` with one struct-field load and one virtual call into
-    /// the encoder. Marked `#[inline(always)]` because LLVM's heuristic
-    /// would otherwise refuse to inline now that the slow path lives
-    /// nearby — the GKE bench showed a +80% regression on
-    /// `string_dictionary/*` when the hint was just `#[inline]`.
-    #[allow(clippy::too_many_arguments)]
-    #[inline(always)]
+    /// `#[inline]`: this is a tiny per-chunk dispatcher; the actual byte
+    /// inspection lives in the out-of-line `byte_budget_sub_batch_size`.
+    #[inline]
     pub(crate) fn pick_sub_batch_size<E: ColumnValueEncoder>(
         &self,
         encoder: &E,
         values: &E::Values,
         value_indices: Option<&[usize]>,
         chunk_def: LevelDataRef<'_>,
-        strategy: ValueCountStrategy<'_>,
         values_offset: usize,
         chunk_size: usize,
-        end_offset: usize,
     ) -> usize {
         if self.static_always_fits || encoder.has_dictionary() || chunk_size == 0 {
             return chunk_size;
         }
-        self.byte_budget_sub_batch_size::<E>(
-            values,
-            value_indices,
-            chunk_def,
-            strategy,
-            values_offset,
-            chunk_size,
-            end_offset,
-        )
+        self.byte_budget_sub_batch_size::<E>(values, value_indices, chunk_def, values_offset, chunk_size)
     }
 
-    /// Cold path: the encoder is plain-encoding and the bypass conditions
-    /// didn't fire, so we have to look at value sizes to decide whether
-    /// the chunk fits. Pulled out of `pick_sub_batch_size` and marked
-    /// `#[inline(never)]` + `#[cold]` so the inlined fast path stays
-    /// small and the dead-code placement signal pushes this body
-    /// physically away from the hot encoder loop's icache footprint.
-    #[allow(clippy::too_many_arguments)]
+    /// Inspect value sizes to decide how much of the chunk fits in a page.
+    ///
+    /// Reached once per chunk for variable-width (`BYTE_ARRAY`) columns
+    /// while plain-encoding — numeric, bool and dictionary-encoded columns
+    /// never get here, so it is `#[cold]` / `#[inline(never)]`: keeping it
+    /// out of line keeps the hot `write_batch_internal` loop small.
     #[inline(never)]
     #[cold]
     fn byte_budget_sub_batch_size<E: ColumnValueEncoder>(
         &self,
         values: &E::Values,
         value_indices: Option<&[usize]>,
         chunk_def: LevelDataRef<'_>,
-        strategy: ValueCountStrategy<'_>,
         values_offset: usize,
         chunk_size: usize,
-        end_offset: usize,
     ) -> usize {
-        // Count how many values fall in this chunk's level range. The
-        // strategy was picked once per `write_batch_internal` call so
-        // the common all-non-null case (every level has a value) skips
-        // the per-chunk binary search and def-level scan entirely.
-        let vals_in_chunk = match strategy {
-            ValueCountStrategy::AllPresent => chunk_size,
-            ValueCountStrategy::Sorted(idx) => {
-                idx[values_offset..].partition_point(|&i| i < end_offset)
-            }
-            ValueCountStrategy::DefLevelScan(max_def) => chunk_def.value_count(chunk_size, max_def),
-        };
+        // How many of this chunk's levels carry an actual value. For a
+        // non-nullable, unrepeated column every level is a value, so
+        // `value_count` is O(1) (`Absent`/`Uniform` def levels); only
+        // nullable or nested columns pay the O(chunk_size) def-level scan.
+        let vals_in_chunk = chunk_def.value_count(chunk_size, self.max_def_level);
         if vals_in_chunk == 0 {
             return chunk_size;
         }
-        // Ask the encoder how many of the next values fit in one page
-        // byte budget. Dispatch on whether the caller supplied gather
-        // indices; this mirrors how `write_mini_batch` picks between
-        // `write_gather` and `write`.
+        // Ask the encoder how many of the next values fit in one page byte
+        // budget. Dispatch on whether the caller supplied gather indices;
+        // this mirrors how `write_mini_batch` picks `write_gather` vs
+        // `write`.
         let fit = match value_indices {
             Some(idx) => {
                 let end = (values_offset + vals_in_chunk).min(idx.len());
                 let start = values_offset.min(end);
-                E::count_values_within_byte_budget_gather(
-                    values,
-                    &idx[start..end],
-                    self.page_byte_limit,
-                )
+                E::count_values_within_byte_budget_gather(values, &idx[start..end], self.page_byte_limit)
             }
             None => E::count_values_within_byte_budget(
                 values,
@@ -240,10 +155,10 @@ impl ByteBudgetChunker {
         match fit {
             None => chunk_size,
             Some(values_per_subbatch) => {
-                // Convert the value count from the encoder back into a
-                // level count. For non-nullable columns this is a no-op;
-                // for nullable, scale by the observed value-to-level
-                // ratio of the current chunk.
+                // Convert the value count back into a level count. For a
+                // non-nullable column this is a no-op; for nullable/nested
+                // columns scale by the chunk's observed value-to-level
+                // ratio.
                 let levels_per_subbatch = if vals_in_chunk == chunk_size {
                     values_per_subbatch
                 } else {
@@ -256,64 +171,3 @@ impl ByteBudgetChunker {
         }
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::basic::Type as PhysicalType;
-    use crate::schema::types::{ColumnPath, Type as SchemaType};
-    use std::sync::Arc;
-
-    fn descr(max_def_level: i16, max_rep_level: i16) -> ColumnDescriptor {
-        let tpe = SchemaType::primitive_type_builder("col", PhysicalType::BYTE_ARRAY)
-            .build()
-            .unwrap();
-        ColumnDescriptor::new(
-            Arc::new(tpe),
-            max_def_level,
-            max_rep_level,
-            ColumnPath::from("col"),
-        )
-    }
-
-    #[test]
-    fn value_count_strategy_uses_def_scan_for_repeated_columns() {
-        // Regression: for a repeated/nested column the arrow `value_indices`
-        // index into the leaf values array, which is decoupled from the
-        // level stream. The `Sorted` strategy's `partition_point` against a
-        // level offset is meaningless there and makes `vals_in_chunk` drift
-        // without bound, spuriously triggering granular sub-batching.
-        // A repeated column with `idx.len() != num_levels` must resolve to
-        // `DefLevelScan`, never `Sorted`.
-        let d = descr(1, 1);
-        let indices = [0usize, 1, 2, 3];
-        let strategy =
-            ByteBudgetChunker::value_count_strategy(&d, Some(&indices), /* num_levels */ 6);
-        assert!(
-            matches!(strategy, ValueCountStrategy::DefLevelScan(1)),
-            "repeated column must count values via def levels, not the \
-             level-offset partition_point"
-        );
-    }
-
-    #[test]
-    fn value_count_strategy_keeps_sorted_for_flat_nullable_columns() {
-        // Flat (`max_rep_level == 0`) nullable columns keep the cheap
-        // `Sorted` strategy: there `non_null_indices` are row positions,
-        // which do coincide with level offsets.
-        let d = descr(1, 0);
-        let indices = [0usize, 2, 5];
-        let strategy =
-            ByteBudgetChunker::value_count_strategy(&d, Some(&indices), /* num_levels */ 8);
-        assert!(matches!(strategy, ValueCountStrategy::Sorted(_)));
-    }
-
-    #[test]
-    fn value_count_strategy_all_present_when_every_level_has_a_value() {
-        let d = descr(1, 1);
-        let indices = [0usize, 1, 2, 3];
-        let strategy =
-            ByteBudgetChunker::value_count_strategy(&d, Some(&indices), /* num_levels */ 4);
-        assert!(matches!(strategy, ValueCountStrategy::AllPresent));
-    }
-}
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
@@ -566,8 +566,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
             self.props.write_batch_size()
         };
         let chunker = ByteBudgetChunker::new(&self.descr, &self.props, base_batch_size);
-        let value_count_strategy =
-            ByteBudgetChunker::value_count_strategy(&self.descr, value_indices, num_levels);
         while levels_offset < num_levels {
             let mut end_offset = num_levels.min(levels_offset + base_batch_size);
 
@@ -587,10 +585,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                 values,
                 value_indices,
                 chunk_def,
-                value_count_strategy,
                 values_offset,
                 chunk_size,
-                end_offset,
             );
 
             if sub_batch_size >= chunk_size {
@@ -5267,6 +5263,42 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_level_data_ref_value_count() {
+        // `value_count` is what the byte-budget chunker uses to convert a
+        // chunk's level span into a leaf-value count. It must work for any
+        // column shape — flat, nullable, or nested — because the leaf
+        // values array is decoupled from the rep/def level stream.
+        let max_def = 2;
+        // Non-nullable / unrepeated: no def levels materialized — every
+        // level is a value.
+        assert_eq!(LevelDataRef::Absent.value_count(64, max_def), 64);
+        // Uniform run of present values, and of nulls.
+        assert_eq!(
+            LevelDataRef::Uniform {
+                value: max_def,
+                count: 40
+            }
+            .value_count(40, max_def),
+            40
+        );
+        assert_eq!(
+            LevelDataRef::Uniform {
+                value: max_def - 1,
+                count: 40
+            }
+            .value_count(40, max_def),
+            0
+        );
+        // Materialized def levels (nullable / nested): only levels equal to
+        // `max_def` are values; empty-list / null levels are not.
+        let levels = [2i16, 0, 2, 1, 2, 2, 0];
+        assert_eq!(
+            LevelDataRef::Materialized(&levels).value_count(levels.len(), max_def),
+            4
+        );
+    }
+
     #[test]
     fn test_uniform_def_levels_all_null() {
         // All-null column: def_level=0 (null) for every slot, no values written.