Skip to content

Commit 2fb9808

Browse files
flaneur2020 and xxchan authored
feat: Support metadata table "Manifests" (#861)
Co-authored-by: xxchan <[email protected]>
1 parent 328e18e commit 2fb9808

File tree

3 files changed

+242
-13
lines changed

3 files changed

+242
-13
lines changed

crates/iceberg/src/metadata_scan.rs

+240-11
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@
1919
2020
use std::sync::Arc;
2121

22-
use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder};
23-
use arrow_array::types::{Int64Type, TimestampMillisecondType};
22+
use arrow_array::builder::{
23+
BooleanBuilder, ListBuilder, MapBuilder, PrimitiveBuilder, StringBuilder, StructBuilder,
24+
};
25+
use arrow_array::types::{Int32Type, Int64Type, Int8Type, TimestampMillisecondType};
2426
use arrow_array::RecordBatch;
25-
use arrow_schema::{DataType, Field, Schema, TimeUnit};
27+
use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit};
2628

27-
use crate::spec::TableMetadata;
2829
use crate::table::Table;
2930
use crate::Result;
3031

@@ -45,19 +46,18 @@ impl MetadataTable {
4546

4647
/// Returns the snapshots metadata table for the wrapped table.
4748
pub fn snapshots(&self) -> SnapshotsTable {
48-
SnapshotsTable {
49-
metadata_table: self,
50-
}
49+
// `self.0` is the wrapped table; the returned view borrows it directly.
SnapshotsTable { table: &self.0 }
5150
}
5251

53-
fn metadata(&self) -> &TableMetadata {
54-
self.0.metadata()
52+
/// Returns the manifests metadata table for the wrapped table.
53+
pub fn manifests(&self) -> ManifestsTable {
54+
// `self.0` is the wrapped table; the returned view borrows it directly.
ManifestsTable { table: &self.0 }
5555
}
5656
}
5757

5858
/// Snapshots table: a metadata view over the snapshots recorded in a table's metadata.
5959
pub struct SnapshotsTable<'a> {
60-
metadata_table: &'a MetadataTable,
60+
// Borrowed table whose metadata supplies the snapshots.
table: &'a Table,
6161
}
6262

6363
impl<'a> SnapshotsTable<'a> {
@@ -104,7 +104,7 @@ impl<'a> SnapshotsTable<'a> {
104104
let mut manifest_list = StringBuilder::new();
105105
let mut summary = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
106106

107-
for snapshot in self.metadata_table.metadata().snapshots() {
107+
for snapshot in self.table.metadata().snapshots() {
108108
committed_at.append_value(snapshot.timestamp_ms());
109109
snapshot_id.append_value(snapshot.snapshot_id());
110110
parent_id.append_option(snapshot.parent_snapshot_id());
@@ -128,6 +128,133 @@ impl<'a> SnapshotsTable<'a> {
128128
}
129129
}
130130

131+
/// Manifests table: a metadata view that `scan` materializes as one row per
/// manifest file in the current snapshot's manifest list.
132+
pub struct ManifestsTable<'a> {
133+
// Borrowed table whose metadata and file IO are used to load the manifest list.
table: &'a Table,
134+
}
135+
136+
impl<'a> ManifestsTable<'a> {
137+
/// Arrow fields of the struct stored in each `partition_summaries` list element.
///
/// Used by both `schema` and `scan`, so the declared schema and the builders
/// that produce the data are derived from the same field list.
fn partition_summary_fields(&self) -> Vec<Field> {
138+
vec![
139+
Field::new("contains_null", DataType::Boolean, false),
140+
Field::new("contains_nan", DataType::Boolean, true),
141+
Field::new("lower_bound", DataType::Utf8, true),
142+
Field::new("upper_bound", DataType::Utf8, true),
143+
]
144+
}
145+
146+
/// Returns the schema of the manifests table.
///
/// Every top-level column is declared non-nullable. `partition_summaries` is a
/// list of non-nullable structs whose fields come from `partition_summary_fields`.
147+
pub fn schema(&self) -> Schema {
148+
Schema::new(vec![
149+
Field::new("content", DataType::Int8, false),
150+
Field::new("path", DataType::Utf8, false),
151+
Field::new("length", DataType::Int64, false),
152+
Field::new("partition_spec_id", DataType::Int32, false),
153+
Field::new("added_snapshot_id", DataType::Int64, false),
154+
Field::new("added_data_files_count", DataType::Int32, false),
155+
Field::new("existing_data_files_count", DataType::Int32, false),
156+
Field::new("deleted_data_files_count", DataType::Int32, false),
157+
Field::new("added_delete_files_count", DataType::Int32, false),
158+
Field::new("existing_delete_files_count", DataType::Int32, false),
159+
Field::new("deleted_delete_files_count", DataType::Int32, false),
160+
Field::new(
161+
"partition_summaries",
162+
DataType::List(Arc::new(Field::new_struct(
163+
"item",
164+
self.partition_summary_fields(),
165+
false,
166+
))),
167+
false,
168+
),
169+
])
170+
}
171+
172+
/// Scans the manifests table.
///
/// Loads the manifest list of the table's current snapshot and returns a single
/// `RecordBatch` with one row per manifest entry, laid out as described by `schema`.
/// When the table has no current snapshot, nothing is appended and the batch has
/// zero rows.
///
/// # Errors
///
/// Propagates any error from loading the manifest list and from
/// `RecordBatch::try_new`.
173+
pub async fn scan(&self) -> Result<RecordBatch> {
174+
// One column builder per field of `schema`, created in the same order.
let mut content = PrimitiveBuilder::<Int8Type>::new();
175+
let mut path = StringBuilder::new();
176+
let mut length = PrimitiveBuilder::<Int64Type>::new();
177+
let mut partition_spec_id = PrimitiveBuilder::<Int32Type>::new();
178+
let mut added_snapshot_id = PrimitiveBuilder::<Int64Type>::new();
179+
let mut added_data_files_count = PrimitiveBuilder::<Int32Type>::new();
180+
let mut existing_data_files_count = PrimitiveBuilder::<Int32Type>::new();
181+
let mut deleted_data_files_count = PrimitiveBuilder::<Int32Type>::new();
182+
let mut added_delete_files_count = PrimitiveBuilder::<Int32Type>::new();
183+
let mut existing_delete_files_count = PrimitiveBuilder::<Int32Type>::new();
184+
let mut deleted_delete_files_count = PrimitiveBuilder::<Int32Type>::new();
185+
// The list item field is set explicitly with the same `Field::new_struct("item", .., false)`
// expression that `schema` uses, so the built array's item field matches the declared one.
let mut partition_summaries = ListBuilder::new(StructBuilder::from_fields(
186+
Fields::from(self.partition_summary_fields()),
187+
0,
188+
))
189+
.with_field(Arc::new(Field::new_struct(
190+
"item",
191+
self.partition_summary_fields(),
192+
false,
193+
)));
194+
195+
// Only the current snapshot is inspected; older snapshots are not visited.
if let Some(snapshot) = self.table.metadata().current_snapshot() {
196+
let manifest_list = snapshot
197+
.load_manifest_list(self.table.file_io(), &self.table.metadata_ref())
198+
.await?;
199+
for manifest in manifest_list.entries() {
200+
// The content enum is stored as its numeric discriminant.
content.append_value(manifest.content as i8);
201+
path.append_value(manifest.manifest_path.clone());
202+
length.append_value(manifest.manifest_length);
203+
partition_spec_id.append_value(manifest.partition_spec_id);
204+
added_snapshot_id.append_value(manifest.added_snapshot_id);
205+
// Missing counts are reported as 0.
// NOTE(review): `as i32` wraps silently if a count does not fit in i32 —
// presumably the source counts are unsigned; confirm whether a checked
// conversion is wanted here.
added_data_files_count.append_value(manifest.added_files_count.unwrap_or(0) as i32);
206+
existing_data_files_count
207+
.append_value(manifest.existing_files_count.unwrap_or(0) as i32);
208+
deleted_data_files_count
209+
.append_value(manifest.deleted_files_count.unwrap_or(0) as i32);
210+
// NOTE(review): the three `*_delete_files_count` columns below read the same
// `added/existing/deleted_files_count` fields as the `*_data_files_count`
// columns above and never look at `manifest.content`, so the data and delete
// counts of a row are always equal. Iceberg's Java ManifestsTable appears to
// report 0 for whichever side does not match the manifest's content type —
// confirm which behavior is intended before relying on these columns.
added_delete_files_count
211+
.append_value(manifest.added_files_count.unwrap_or(0) as i32);
212+
existing_delete_files_count
213+
.append_value(manifest.existing_files_count.unwrap_or(0) as i32);
214+
deleted_delete_files_count
215+
.append_value(manifest.deleted_files_count.unwrap_or(0) as i32);
216+
217+
// `values()` exposes the inner struct builder; each partition summary becomes
// one struct element of this manifest's list.
let partition_summaries_builder = partition_summaries.values();
218+
for summary in &manifest.partitions {
219+
// The field indices and builder types below follow the order and types in
// `partition_summary_fields`; the `unwrap`s rely on the two staying in sync.
partition_summaries_builder
220+
.field_builder::<BooleanBuilder>(0)
221+
.unwrap()
222+
.append_value(summary.contains_null);
223+
partition_summaries_builder
224+
.field_builder::<BooleanBuilder>(1)
225+
.unwrap()
226+
.append_option(summary.contains_nan);
227+
// Bounds are rendered through `to_string()`; absent bounds become nulls.
partition_summaries_builder
228+
.field_builder::<StringBuilder>(2)
229+
.unwrap()
230+
.append_option(summary.lower_bound.as_ref().map(|v| v.to_string()));
231+
partition_summaries_builder
232+
.field_builder::<StringBuilder>(3)
233+
.unwrap()
234+
.append_option(summary.upper_bound.as_ref().map(|v| v.to_string()));
235+
// Finish this struct element as a valid (non-null) entry.
partition_summaries_builder.append(true);
236+
}
237+
// Close the list for this manifest; it is appended even when there are no summaries.
partition_summaries.append(true);
238+
}
239+
}
240+
241+
// Column order must match the field order declared in `schema`.
Ok(RecordBatch::try_new(Arc::new(self.schema()), vec![
242+
Arc::new(content.finish()),
243+
Arc::new(path.finish()),
244+
Arc::new(length.finish()),
245+
Arc::new(partition_spec_id.finish()),
246+
Arc::new(added_snapshot_id.finish()),
247+
Arc::new(added_data_files_count.finish()),
248+
Arc::new(existing_data_files_count.finish()),
249+
Arc::new(deleted_data_files_count.finish()),
250+
Arc::new(added_delete_files_count.finish()),
251+
Arc::new(existing_delete_files_count.finish()),
252+
Arc::new(deleted_delete_files_count.finish()),
253+
Arc::new(partition_summaries.finish()),
254+
])?)
255+
}
256+
}
257+
131258
#[cfg(test)]
132259
mod tests {
133260
use expect_test::{expect, Expect};
@@ -253,4 +380,106 @@ mod tests {
253380
Some("committed_at"),
254381
);
255382
}
383+
384+
// Snapshot test for the manifests table: sets up the fixture's manifest files,
// scans the manifests table, and compares the resulting schema and column data
// against the expected text below.
// NOTE(review): the expected data shows `content` 0 together with delete-file
// counts of 1, i.e. the delete counts equal the data counts for this manifest —
// confirm that this is the intended output rather than a pinned quirk of `scan`.
#[tokio::test]
385+
async fn test_manifests_table() {
386+
let mut fixture = TableTestFixture::new();
387+
fixture.setup_manifest_files().await;
388+
389+
let record_batch = fixture
390+
.table
391+
.metadata_table()
392+
.manifests()
393+
.scan()
394+
.await
395+
.unwrap();
396+
397+
check_record_batch(
398+
record_batch,
399+
// Expected schema, one `Field` per line.
expect![[r#"
400+
Field { name: "content", data_type: Int8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
401+
Field { name: "path", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
402+
Field { name: "length", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
403+
Field { name: "partition_spec_id", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
404+
Field { name: "added_snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
405+
Field { name: "added_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
406+
Field { name: "existing_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
407+
Field { name: "deleted_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
408+
Field { name: "added_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
409+
Field { name: "existing_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
410+
Field { name: "deleted_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
411+
Field { name: "partition_summaries", data_type: List(Field { name: "item", data_type: Struct([Field { name: "contains_null", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "contains_nan", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "lower_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "upper_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }"#]],
412+
// Expected column data, one array per column.
expect![[r#"
413+
content: PrimitiveArray<Int8>
414+
[
415+
0,
416+
],
417+
path: (skipped),
418+
length: (skipped),
419+
partition_spec_id: PrimitiveArray<Int32>
420+
[
421+
0,
422+
],
423+
added_snapshot_id: PrimitiveArray<Int64>
424+
[
425+
3055729675574597004,
426+
],
427+
added_data_files_count: PrimitiveArray<Int32>
428+
[
429+
1,
430+
],
431+
existing_data_files_count: PrimitiveArray<Int32>
432+
[
433+
1,
434+
],
435+
deleted_data_files_count: PrimitiveArray<Int32>
436+
[
437+
1,
438+
],
439+
added_delete_files_count: PrimitiveArray<Int32>
440+
[
441+
1,
442+
],
443+
existing_delete_files_count: PrimitiveArray<Int32>
444+
[
445+
1,
446+
],
447+
deleted_delete_files_count: PrimitiveArray<Int32>
448+
[
449+
1,
450+
],
451+
partition_summaries: ListArray
452+
[
453+
StructArray
454+
-- validity:
455+
[
456+
valid,
457+
]
458+
[
459+
-- child 0: "contains_null" (Boolean)
460+
BooleanArray
461+
[
462+
false,
463+
]
464+
-- child 1: "contains_nan" (Boolean)
465+
BooleanArray
466+
[
467+
false,
468+
]
469+
-- child 2: "lower_bound" (Utf8)
470+
StringArray
471+
[
472+
"100",
473+
]
474+
-- child 3: "upper_bound" (Utf8)
475+
StringArray
476+
[
477+
"300",
478+
]
479+
],
480+
]"#]],
481+
// These two columns appear as `(skipped)` in the expected data above —
// presumably because their values differ from run to run; verify against
// `check_record_batch`.
&["path", "length"],
482+
Some("path"),
483+
);
484+
}
256485
}

crates/iceberg/src/scan.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1050,7 +1050,7 @@ pub mod tests {
10501050
.unwrap()
10511051
}
10521052

1053-
async fn setup_manifest_files(&mut self) {
1053+
pub async fn setup_manifest_files(&mut self) {
10541054
let current_snapshot = self.table.metadata().current_snapshot().unwrap();
10551055
let parent_snapshot = current_snapshot
10561056
.parent_snapshot(self.table.metadata())

crates/iceberg/src/spec/manifest_list.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,7 @@ impl ManifestFile {
597597
}
598598

599599
/// The type of files tracked by the manifest, either data or delete files; Data(0) for all v1 manifests
600-
#[derive(Debug, PartialEq, Clone, Eq)]
600+
#[derive(Debug, PartialEq, Clone, Copy, Eq)]
601601
pub enum ManifestContentType {
602602
/// The manifest content is data.
603603
Data = 0,

0 commit comments

Comments
 (0)