apache · kosiew · Jun 17, 2025 · Jun 17, 2025 · Jun 18, 2025 · Jun 18, 2025
diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs
@@ -184,6 +184,26 @@ fn hash_array<T>(
     }
 }
 
+/// Updates `hash` from `dict_hashes[idx]` when dictionary value `idx` is valid; otherwise a no-op.
+#[cfg(not(feature = "force_hash_collisions"))]
+#[inline]
+fn update_hash_for_dict_key(
+    hash: &mut u64, // updated in place
+    dict_hashes: &[u64], // read at `idx`; indexing panics if `idx` is out of bounds
+    dict_values: &dyn Array, // only consulted for the validity of entry `idx`
+    idx: usize, // index used for both `dict_values` and `dict_hashes`
+    multi_col: bool, // selects combine (true) vs. overwrite (false) below
+) {
+    if dict_values.is_valid(idx) {
+        if multi_col { // keep what `*hash` already holds and mix the value's hash into it
+            *hash = combine_hashes(dict_hashes[idx], *hash);
+        } else { // previous content of `*hash` is discarded
+            *hash = dict_hashes[idx];
+        }
+    }
+    // `is_valid(idx)` returned false: `*hash` is intentionally left exactly as it was passed in
+}
+
 /// Hash the values in a dictionary array
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_dictionary<K: ArrowDictionaryKeyType>(
@@ -195,23 +215,23 @@ fn hash_dictionary<K: ArrowDictionaryKeyType>(
     // Hash each dictionary value once, and then use that computed
     // hash for each key value to avoid a potentially expensive
     // redundant hashing for large dictionary elements (e.g. strings)
-    let values = Arc::clone(array.values());
-    let mut dict_hashes = vec![0; values.len()];
-    create_hashes(&[values], random_state, &mut dict_hashes)?;
+    let dict_values = Arc::clone(array.values());
+    let mut dict_hashes = vec![0; dict_values.len()];
+    create_hashes(&[dict_values], random_state, &mut dict_hashes)?;
 
     // combine hash for each index in values
-    if multi_col {
-        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
-            if let Some(key) = key {
-                *hash = combine_hashes(dict_hashes[key.as_usize()], *hash)
-            } // no update for Null, consistent with other hashes
-        }
-    } else {
-        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
-            if let Some(key) = key {
-                *hash = dict_hashes[key.as_usize()]
-            } // no update for Null, consistent with other hashes
-        }
+    let dict_values = array.values();
+    for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
+        if let Some(key) = key {
+            let idx = key.as_usize();
+            update_hash_for_dict_key(
+                hash,
+                &dict_hashes,
+                dict_values.as_ref(),
+                idx,
+                multi_col,
+            );
+        } // no update for Null key
     }
     Ok(())
 }

diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs
@@ -171,7 +171,7 @@ impl AggregationFuzzer {
             let datasets = self
                 .dataset_generator
                 .generate()
-                .expect("should success to generate dataset");
+                .expect("should succeed to generate dataset");
 
             // Then for each of them, we random select a test sql for it
             let query_groups = datasets
@@ -216,16 +216,16 @@ impl AggregationFuzzer {
             // Generate the baseline context, and get the baseline result firstly
             let baseline_ctx_with_params = ctx_generator
                 .generate_baseline()
-                .expect("should success to generate baseline session context");
+                .expect("should succeed to generate baseline session context");
             let baseline_result = run_sql(&sql, &baseline_ctx_with_params.ctx)
                 .await
-                .expect("should success to run baseline sql");
+                .expect("should succeed to run baseline sql");
             let baseline_result = Arc::new(baseline_result);
             // Generate test tasks
             for _ in 0..CTX_GEN_ROUNDS {
                 let ctx_with_params = ctx_generator
                     .generate()
-                    .expect("should success to generate session context");
+                    .expect("should succeed to generate session context");
                 let task = AggregationFuzzTestTask {
                     dataset_ref: dataset_ref.clone(),
                     expected_result: baseline_result.clone(),

diff --git a/datafusion/core/tests/fuzz_cases/record_batch_generator.rs b/datafusion/core/tests/fuzz_cases/record_batch_generator.rs
@@ -724,15 +724,13 @@ impl RecordBatchGenerator {
             {
                 // We generate just num_distinct values because they will be reused by different keys
                 let mut array_gen_rng = array_gen_rng;
-
+                debug_assert!((0.0..=1.0).contains(&null_pct));
                 let values = Self::generate_array_of_type_inner(
                     &ColumnDescr::new("values", *value_type.clone()),
                     num_distinct,
                     batch_gen_rng,
                     array_gen_rng.clone(),
-                    // Once https://github.com/apache/datafusion/issues/16228 is fixed
-                    // we can also generate nulls in values
-                    0.0, // null values are generated on the key level
+                    null_pct, // generate some null values
                 );
 
                 match key_type.as_ref() {