@@ -113,14 +113,15 @@ pub fn parquet_to_datafile(
113113 Some ( Value :: Int ( current_min) ) ,
114114 Some ( Value :: Int ( current_max) ) ,
115115 ) => {
116- let overlap =
117- range_overlap ( & [ current_min, current_max] , & [ & min, & max] ) . max ( 0 ) ;
118116 distinct_counts
119117 . entry ( id)
120118 . and_modify ( |x| {
121- * x += ( ( 1 - overlap as i64 / ( max - min) as i64 )
122- * distinct_count as i64 )
123- as i64
119+ * x += estimate_distinct_count (
120+ & [ current_min, current_max] ,
121+ & [ & min, & max] ,
122+ * x,
123+ distinct_count as i64 ,
124+ ) ;
124125 } )
125126 . or_insert ( distinct_count as i64 ) ;
126127 }
@@ -135,12 +136,6 @@ pub fn parquet_to_datafile(
135136 }
136137 _ => ( ) ,
137138 }
138- if let Type :: Primitive ( _) = & data_type {
139- distinct_counts
140- . entry ( id)
141- . and_modify ( |x| * x += distinct_count as i64 )
142- . or_insert ( distinct_count as i64 ) ;
143- }
144139 }
145140
146141 if let Some ( min_bytes) = statistics. min_bytes_opt ( ) {
@@ -326,3 +321,42 @@ fn range_overlap<T: Ord + Sub + Copy>(
326321 let overlap_end = ( * old_range[ 1 ] ) . min ( * new_range[ 1 ] ) ;
327322 overlap_end - overlap_start
328323}
324+
/// Estimates how many additional distinct values a new value range contributes
/// on top of an existing one.
///
/// The new range's `new_distinct_count` values are assumed to be spread
/// uniformly over `new_range`, and the existing `old_distinct_count` values
/// uniformly over `old_range`:
///
/// * values of the new range that fall outside the overlap with the old range
///   are all counted as new;
/// * values inside the overlap are counted as new with probability
///   `((R - 1) / R)^k`, where `R` is the size of the overlap and `k` the
///   expected number of old values inside it (independence approximation).
///
/// # Parameters
/// * `old_range` / `new_range` - `[min, max]` bounds of the existing and the
///   incoming values.
/// * `old_distinct_count` / `new_distinct_count` - distinct-value counts
///   associated with the respective ranges.
///
/// # Returns
/// The estimated number of distinct values to add, rounded to the nearest
/// integer.
///
/// # Edge cases
/// * A new range with `min == max` has zero width, so no ratio can be formed.
///   Its values are counted as entirely new when the point lies outside
///   `old_range`, and as already present (contributing 0) when it lies inside.
/// * A zero-width old range contributes no expected values to the overlap.
fn estimate_distinct_count<T>(
    old_range: &[&T; 2],
    new_range: &[&T; 2],
    old_distinct_count: i64,
    new_distinct_count: i64,
) -> i64
where
    T: Ord + Sub<Output = T> + Copy + Into<f64> + Default,
{
    // Convert the bounds to f64 *before* subtracting. Subtracting in `T`
    // (here or through `range_overlap`) can overflow for integer bounds that
    // are far apart, which panics in debug builds and wraps in release builds.
    let old_min: f64 = (*old_range[0]).into();
    let old_max: f64 = (*old_range[1]).into();
    let new_min: f64 = (*new_range[0]).into();
    let new_max: f64 = (*new_range[1]).into();

    let new_range_size = new_max - new_min;
    let current_range_size = old_max - old_min;
    // Disjoint ranges produce a negative difference; treat that as no overlap.
    let overlap_size = (old_max.min(new_max) - old_min.max(new_min)).max(0.0);

    let n1 = old_distinct_count as f64;
    let n2 = new_distinct_count as f64;

    // Split the new distinct values into the part outside the overlap
    // (definitely new) and the part inside it (possibly already seen).
    let (outside_overlap, n2_overlap) = if new_range_size > 0.0 {
        (
            ((new_range_size - overlap_size) / new_range_size * n2).max(0.0),
            (overlap_size / new_range_size * n2).max(0.0),
        )
    } else if old_min <= new_min && new_min <= old_max {
        // Zero-width new range located inside the old range.
        (0.0, n2.max(0.0))
    } else {
        // Zero-width new range located outside the old range: without this
        // branch the 0/0 ratios would silently collapse the estimate to 0.
        (n2.max(0.0), 0.0)
    };

    // Expected number of already-known values that fall into the overlap.
    let expected_n1_in_overlap = if current_range_size > 0.0 {
        (overlap_size / current_range_size * n1).max(0.0)
    } else {
        0.0
    };

    // Independence approximation: P(value not covered) = ((R - 1) / R)^k.
    let new_in_overlap = if overlap_size > 0.0 {
        let prob_not_covered = ((overlap_size - 1.0) / overlap_size).powf(expected_n1_in_overlap);
        n2_overlap * prob_not_covered
    } else {
        0.0
    };

    (outside_overlap + new_in_overlap).round() as i64
}
0 commit comments