19
19
20
20
use std:: sync:: Arc ;
21
21
22
- use arrow_array:: builder:: { MapBuilder , PrimitiveBuilder , StringBuilder } ;
23
- use arrow_array:: types:: { Int64Type , TimestampMillisecondType } ;
22
+ use arrow_array:: builder:: {
23
+ BooleanBuilder , ListBuilder , MapBuilder , PrimitiveBuilder , StringBuilder , StructBuilder ,
24
+ } ;
25
+ use arrow_array:: types:: { Int32Type , Int64Type , Int8Type , TimestampMillisecondType } ;
24
26
use arrow_array:: RecordBatch ;
25
- use arrow_schema:: { DataType , Field , Schema , TimeUnit } ;
27
+ use arrow_schema:: { DataType , Field , Fields , Schema , TimeUnit } ;
26
28
27
- use crate :: spec:: TableMetadata ;
28
29
use crate :: table:: Table ;
29
30
use crate :: Result ;
30
31
@@ -45,19 +46,18 @@ impl MetadataTable {
45
46
46
47
/// Get the snapshots table.
47
48
pub fn snapshots ( & self ) -> SnapshotsTable {
48
- SnapshotsTable {
49
- metadata_table : self ,
50
- }
49
+ SnapshotsTable { table : & self . 0 }
51
50
}
52
51
53
- fn metadata ( & self ) -> & TableMetadata {
54
- self . 0 . metadata ( )
52
+ /// Get the manifests table.
53
+ pub fn manifests ( & self ) -> ManifestsTable {
54
+ ManifestsTable { table : & self . 0 }
55
55
}
56
56
}
57
57
58
58
/// Snapshots table.
59
59
pub struct SnapshotsTable < ' a > {
60
- metadata_table : & ' a MetadataTable ,
60
+ table : & ' a Table ,
61
61
}
62
62
63
63
impl < ' a > SnapshotsTable < ' a > {
@@ -104,7 +104,7 @@ impl<'a> SnapshotsTable<'a> {
104
104
let mut manifest_list = StringBuilder :: new ( ) ;
105
105
let mut summary = MapBuilder :: new ( None , StringBuilder :: new ( ) , StringBuilder :: new ( ) ) ;
106
106
107
- for snapshot in self . metadata_table . metadata ( ) . snapshots ( ) {
107
+ for snapshot in self . table . metadata ( ) . snapshots ( ) {
108
108
committed_at. append_value ( snapshot. timestamp_ms ( ) ) ;
109
109
snapshot_id. append_value ( snapshot. snapshot_id ( ) ) ;
110
110
parent_id. append_option ( snapshot. parent_snapshot_id ( ) ) ;
@@ -128,6 +128,133 @@ impl<'a> SnapshotsTable<'a> {
128
128
}
129
129
}
130
130
131
+ /// Manifests table.
132
+ pub struct ManifestsTable < ' a > {
133
+ table : & ' a Table ,
134
+ }
135
+
136
+ impl < ' a > ManifestsTable < ' a > {
137
+ fn partition_summary_fields ( & self ) -> Vec < Field > {
138
+ vec ! [
139
+ Field :: new( "contains_null" , DataType :: Boolean , false ) ,
140
+ Field :: new( "contains_nan" , DataType :: Boolean , true ) ,
141
+ Field :: new( "lower_bound" , DataType :: Utf8 , true ) ,
142
+ Field :: new( "upper_bound" , DataType :: Utf8 , true ) ,
143
+ ]
144
+ }
145
+
146
+ /// Returns the schema of the manifests table.
147
+ pub fn schema ( & self ) -> Schema {
148
+ Schema :: new ( vec ! [
149
+ Field :: new( "content" , DataType :: Int8 , false ) ,
150
+ Field :: new( "path" , DataType :: Utf8 , false ) ,
151
+ Field :: new( "length" , DataType :: Int64 , false ) ,
152
+ Field :: new( "partition_spec_id" , DataType :: Int32 , false ) ,
153
+ Field :: new( "added_snapshot_id" , DataType :: Int64 , false ) ,
154
+ Field :: new( "added_data_files_count" , DataType :: Int32 , false ) ,
155
+ Field :: new( "existing_data_files_count" , DataType :: Int32 , false ) ,
156
+ Field :: new( "deleted_data_files_count" , DataType :: Int32 , false ) ,
157
+ Field :: new( "added_delete_files_count" , DataType :: Int32 , false ) ,
158
+ Field :: new( "existing_delete_files_count" , DataType :: Int32 , false ) ,
159
+ Field :: new( "deleted_delete_files_count" , DataType :: Int32 , false ) ,
160
+ Field :: new(
161
+ "partition_summaries" ,
162
+ DataType :: List ( Arc :: new( Field :: new_struct(
163
+ "item" ,
164
+ self . partition_summary_fields( ) ,
165
+ false ,
166
+ ) ) ) ,
167
+ false ,
168
+ ) ,
169
+ ] )
170
+ }
171
+
172
+ /// Scans the manifests table.
173
+ pub async fn scan ( & self ) -> Result < RecordBatch > {
174
+ let mut content = PrimitiveBuilder :: < Int8Type > :: new ( ) ;
175
+ let mut path = StringBuilder :: new ( ) ;
176
+ let mut length = PrimitiveBuilder :: < Int64Type > :: new ( ) ;
177
+ let mut partition_spec_id = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
178
+ let mut added_snapshot_id = PrimitiveBuilder :: < Int64Type > :: new ( ) ;
179
+ let mut added_data_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
180
+ let mut existing_data_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
181
+ let mut deleted_data_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
182
+ let mut added_delete_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
183
+ let mut existing_delete_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
184
+ let mut deleted_delete_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
185
+ let mut partition_summaries = ListBuilder :: new ( StructBuilder :: from_fields (
186
+ Fields :: from ( self . partition_summary_fields ( ) ) ,
187
+ 0 ,
188
+ ) )
189
+ . with_field ( Arc :: new ( Field :: new_struct (
190
+ "item" ,
191
+ self . partition_summary_fields ( ) ,
192
+ false ,
193
+ ) ) ) ;
194
+
195
+ if let Some ( snapshot) = self . table . metadata ( ) . current_snapshot ( ) {
196
+ let manifest_list = snapshot
197
+ . load_manifest_list ( self . table . file_io ( ) , & self . table . metadata_ref ( ) )
198
+ . await ?;
199
+ for manifest in manifest_list. entries ( ) {
200
+ content. append_value ( manifest. content as i8 ) ;
201
+ path. append_value ( manifest. manifest_path . clone ( ) ) ;
202
+ length. append_value ( manifest. manifest_length ) ;
203
+ partition_spec_id. append_value ( manifest. partition_spec_id ) ;
204
+ added_snapshot_id. append_value ( manifest. added_snapshot_id ) ;
205
+ added_data_files_count. append_value ( manifest. added_files_count . unwrap_or ( 0 ) as i32 ) ;
206
+ existing_data_files_count
207
+ . append_value ( manifest. existing_files_count . unwrap_or ( 0 ) as i32 ) ;
208
+ deleted_data_files_count
209
+ . append_value ( manifest. deleted_files_count . unwrap_or ( 0 ) as i32 ) ;
210
+ added_delete_files_count
211
+ . append_value ( manifest. added_files_count . unwrap_or ( 0 ) as i32 ) ;
212
+ existing_delete_files_count
213
+ . append_value ( manifest. existing_files_count . unwrap_or ( 0 ) as i32 ) ;
214
+ deleted_delete_files_count
215
+ . append_value ( manifest. deleted_files_count . unwrap_or ( 0 ) as i32 ) ;
216
+
217
+ let partition_summaries_builder = partition_summaries. values ( ) ;
218
+ for summary in & manifest. partitions {
219
+ partition_summaries_builder
220
+ . field_builder :: < BooleanBuilder > ( 0 )
221
+ . unwrap ( )
222
+ . append_value ( summary. contains_null ) ;
223
+ partition_summaries_builder
224
+ . field_builder :: < BooleanBuilder > ( 1 )
225
+ . unwrap ( )
226
+ . append_option ( summary. contains_nan ) ;
227
+ partition_summaries_builder
228
+ . field_builder :: < StringBuilder > ( 2 )
229
+ . unwrap ( )
230
+ . append_option ( summary. lower_bound . as_ref ( ) . map ( |v| v. to_string ( ) ) ) ;
231
+ partition_summaries_builder
232
+ . field_builder :: < StringBuilder > ( 3 )
233
+ . unwrap ( )
234
+ . append_option ( summary. upper_bound . as_ref ( ) . map ( |v| v. to_string ( ) ) ) ;
235
+ partition_summaries_builder. append ( true ) ;
236
+ }
237
+ partition_summaries. append ( true ) ;
238
+ }
239
+ }
240
+
241
+ Ok ( RecordBatch :: try_new ( Arc :: new ( self . schema ( ) ) , vec ! [
242
+ Arc :: new( content. finish( ) ) ,
243
+ Arc :: new( path. finish( ) ) ,
244
+ Arc :: new( length. finish( ) ) ,
245
+ Arc :: new( partition_spec_id. finish( ) ) ,
246
+ Arc :: new( added_snapshot_id. finish( ) ) ,
247
+ Arc :: new( added_data_files_count. finish( ) ) ,
248
+ Arc :: new( existing_data_files_count. finish( ) ) ,
249
+ Arc :: new( deleted_data_files_count. finish( ) ) ,
250
+ Arc :: new( added_delete_files_count. finish( ) ) ,
251
+ Arc :: new( existing_delete_files_count. finish( ) ) ,
252
+ Arc :: new( deleted_delete_files_count. finish( ) ) ,
253
+ Arc :: new( partition_summaries. finish( ) ) ,
254
+ ] ) ?)
255
+ }
256
+ }
257
+
131
258
#[ cfg( test) ]
132
259
mod tests {
133
260
use expect_test:: { expect, Expect } ;
@@ -253,4 +380,106 @@ mod tests {
253
380
Some ( "committed_at" ) ,
254
381
) ;
255
382
}
383
+
384
// Scans the manifests metadata table of a fixture table (after
// `setup_manifest_files`) and snapshot-checks the resulting record batch:
// the first `expect!` pins the schema, the second pins the column contents.
// "path" and "length" are passed in the slice argument and show up as
// "(skipped)" in the expected contents; the trailing `Some("path")` is
// presumably a sort key -- confirm against `check_record_batch`.
// NOTE(review): the expected delete-file counts equal the data-file counts
// for this content-0 manifest; confirm that is intended and regenerate the
// snapshot if the count attribution in `scan` changes.
+ #[ tokio:: test]
385
+ async fn test_manifests_table ( ) {
386
+ let mut fixture = TableTestFixture :: new ( ) ;
387
+ fixture. setup_manifest_files ( ) . await ;
388
+
389
+ let record_batch = fixture
390
+ . table
391
+ . metadata_table ( )
392
+ . manifests ( )
393
+ . scan ( )
394
+ . await
395
+ . unwrap ( ) ;
396
+
397
+ check_record_batch (
398
+ record_batch,
399
+ expect ! [ [ r#"
400
+ Field { name: "content", data_type: Int8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
401
+ Field { name: "path", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
402
+ Field { name: "length", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
403
+ Field { name: "partition_spec_id", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
404
+ Field { name: "added_snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
405
+ Field { name: "added_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
406
+ Field { name: "existing_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
407
+ Field { name: "deleted_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
408
+ Field { name: "added_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
409
+ Field { name: "existing_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
410
+ Field { name: "deleted_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
411
+ Field { name: "partition_summaries", data_type: List(Field { name: "item", data_type: Struct([Field { name: "contains_null", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "contains_nan", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "lower_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "upper_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }"# ] ] ,
412
+ expect ! [ [ r#"
413
+ content: PrimitiveArray<Int8>
414
+ [
415
+ 0,
416
+ ],
417
+ path: (skipped),
418
+ length: (skipped),
419
+ partition_spec_id: PrimitiveArray<Int32>
420
+ [
421
+ 0,
422
+ ],
423
+ added_snapshot_id: PrimitiveArray<Int64>
424
+ [
425
+ 3055729675574597004,
426
+ ],
427
+ added_data_files_count: PrimitiveArray<Int32>
428
+ [
429
+ 1,
430
+ ],
431
+ existing_data_files_count: PrimitiveArray<Int32>
432
+ [
433
+ 1,
434
+ ],
435
+ deleted_data_files_count: PrimitiveArray<Int32>
436
+ [
437
+ 1,
438
+ ],
439
+ added_delete_files_count: PrimitiveArray<Int32>
440
+ [
441
+ 1,
442
+ ],
443
+ existing_delete_files_count: PrimitiveArray<Int32>
444
+ [
445
+ 1,
446
+ ],
447
+ deleted_delete_files_count: PrimitiveArray<Int32>
448
+ [
449
+ 1,
450
+ ],
451
+ partition_summaries: ListArray
452
+ [
453
+ StructArray
454
+ -- validity:
455
+ [
456
+ valid,
457
+ ]
458
+ [
459
+ -- child 0: "contains_null" (Boolean)
460
+ BooleanArray
461
+ [
462
+ false,
463
+ ]
464
+ -- child 1: "contains_nan" (Boolean)
465
+ BooleanArray
466
+ [
467
+ false,
468
+ ]
469
+ -- child 2: "lower_bound" (Utf8)
470
+ StringArray
471
+ [
472
+ "100",
473
+ ]
474
+ -- child 3: "upper_bound" (Utf8)
475
+ StringArray
476
+ [
477
+ "300",
478
+ ]
479
+ ],
480
+ ]"# ] ] ,
481
+ & [ "path" , "length" ] ,
482
+ Some ( "path" ) ,
483
+ ) ;
484
+ }
256
485
}
0 commit comments