@@ -322,6 +322,39 @@ fn range_overlap<T: Ord + Sub + Copy>(
322322 overlap_end - overlap_start
323323}
324324
325+ /// Estimates the number of new distinct values when merging two sets of statistics.
326+ ///
327+ /// This function assumes uniform distribution of distinct values within their respective ranges
328+ /// and uses an independence approximation to estimate overlap probability.
329+ ///
330+ /// # Algorithm
331+ ///
332+ /// The estimation is split into two parts:
333+ /// 1. **Non-overlapping region**: All values in the new range that fall outside the old range
334+ /// are guaranteed to be new.
335+ /// 2. **Overlapping region**: Uses the independence approximation:
336+ /// - P(specific value not covered) = ((R-1)/R)^k
337+ /// - where R is the overlap size and k is the expected number of old values in the overlap
338+ /// - Expected new values = n2_overlap × P(not covered), where n2_overlap is the expected number of new values in the overlap
339+ ///
340+ /// # Parameters
341+ ///
342+ /// * `old_range` - [min, max] of the existing value range
343+ /// * `new_range` - [min, max] of the new value range
344+ /// * `old_distinct_count` - Number of distinct values in the old range
345+ /// * `new_distinct_count` - Number of distinct values in the new range
346+ ///
347+ /// # Returns
348+ ///
349+ /// Estimated number of new distinct values to add to the running total
350+ ///
351+ /// # Examples
352+ ///
353+ /// ```ignore
354+ /// // Old range [0, 1000] with 100 distinct values
355+ /// // New range [500, 1500] with 50 distinct values
356+ /// let new_count = estimate_distinct_count(&[&0, &1000], &[&500, &1500], 100, 50);
357+ /// ```
325358fn estimate_distinct_count < T > (
326359 old_range : & [ & T ; 2 ] ,
327360 new_range : & [ & T ; 2 ] ,
0 commit comments