Skip to content

Commit bead5a1

Browse files
committed
calculate cardinality estimate
1 parent 31ded68 commit bead5a1

File tree

1 file changed

+45
-11
lines changed

1 file changed

+45
-11
lines changed

iceberg-rust/src/file_format/parquet.rs

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -113,14 +113,15 @@ pub fn parquet_to_datafile(
113113
Some(Value::Int(current_min)),
114114
Some(Value::Int(current_max)),
115115
) => {
116-
let overlap =
117-
range_overlap(&[current_min, current_max], &[&min, &max]).max(0);
118116
distinct_counts
119117
.entry(id)
120118
.and_modify(|x| {
121-
*x += ((1 - overlap as i64 / (max - min) as i64)
122-
* distinct_count as i64)
123-
as i64
119+
*x += estimate_distinct_count(
120+
&[current_min, current_max],
121+
&[&min, &max],
122+
*x,
123+
distinct_count as i64,
124+
);
124125
})
125126
.or_insert(distinct_count as i64);
126127
}
@@ -135,12 +136,6 @@ pub fn parquet_to_datafile(
135136
}
136137
_ => (),
137138
}
138-
if let Type::Primitive(_) = &data_type {
139-
distinct_counts
140-
.entry(id)
141-
.and_modify(|x| *x += distinct_count as i64)
142-
.or_insert(distinct_count as i64);
143-
}
144139
}
145140

146141
if let Some(min_bytes) = statistics.min_bytes_opt() {
@@ -326,3 +321,42 @@ fn range_overlap<T: Ord + Sub + Copy>(
326321
let overlap_end = (*old_range[1]).min(*new_range[1]);
327322
overlap_end - overlap_start
328323
}
324+
325+
/// Estimates how many distinct values a new `[min, max]` range contributes on
/// top of an already-accounted-for range.
///
/// The estimate assumes values are spread uniformly over their range:
///
/// * the share of the new range lying outside the old range is counted as
///   entirely new;
/// * inside the overlap, each new value is assumed to be missed by any single
///   old value with probability `(R - 1) / R` (where `R` is the overlap
///   width), so it is still unseen after `k` old values with probability
///   `((R - 1) / R)^k`.
///
/// # Arguments
///
/// * `old_range` - `[min, max]` of the values already counted.
/// * `new_range` - `[min, max]` of the values being added.
/// * `old_distinct_count` - distinct count accumulated for `old_range`.
/// * `new_distinct_count` - distinct count reported for `new_range`.
///
/// # Returns
///
/// The estimated number of additional distinct values, rounded to the nearest
/// integer. A zero-width new range (a single value) yields
/// `new_distinct_count` when it lies outside the old range and `0` when it
/// lies inside it.
fn estimate_distinct_count<T>(
    old_range: &[&T; 2],
    new_range: &[&T; 2],
    old_distinct_count: i64,
    new_distinct_count: i64,
) -> i64
where
    T: Ord + Sub<Output = T> + Copy + Into<f64> + Default,
{
    // Widen to f64 *before* subtracting: doing the arithmetic in `T` overflows
    // for ranges spanning most of the integer domain and underflows for
    // disjoint unsigned ranges.
    let old_min: f64 = (*old_range[0]).into();
    let old_max: f64 = (*old_range[1]).into();
    let new_min: f64 = (*new_range[0]).into();
    let new_max: f64 = (*new_range[1]).into();

    let new_range_size = new_max - new_min;
    let current_range_size = old_max - old_min;

    // Negative when the two ranges are disjoint.
    let raw_overlap = old_max.min(new_max) - old_min.max(new_min);
    let overlap_size = raw_overlap.max(0.0);

    // A zero-width (or malformed, inverted) new range cannot be used as a
    // divisor. Decide directly: outside the old range everything is new,
    // inside it nothing is assumed to be new.
    if new_range_size <= 0.0 {
        return if raw_overlap < 0.0 {
            new_distinct_count.max(0)
        } else {
            0
        };
    }

    let n2 = new_distinct_count as f64;
    let n1 = old_distinct_count as f64;

    // Values outside the overlap are definitely new.
    let outside_overlap = ((new_range_size - overlap_size) / new_range_size * n2).max(0.0);

    // For the overlap region, estimate how many new values are not yet present
    // using the independence approximation described above. A positive overlap
    // implies both range widths are positive, so the divisions are safe.
    let new_in_overlap = if overlap_size > 0.0 {
        let n2_overlap = (overlap_size / new_range_size * n2).max(0.0);
        let expected_n1_in_overlap = (overlap_size / current_range_size * n1).max(0.0);
        let prob_not_covered = ((overlap_size - 1.0) / overlap_size).powf(expected_n1_in_overlap);
        n2_overlap * prob_not_covered
    } else {
        0.0
    };

    (outside_overlap + new_in_overlap).round() as i64
}

0 commit comments

Comments
 (0)