@@ -25,17 +25,22 @@ use iceberg::spec::{
     DataFile, ManifestEntry, ManifestStatus, NestedField, PrimitiveType, Schema, Type,
 };
 use iceberg::table::Table;
-use iceberg::transaction::{Transaction, MANIFEST_MERGE_ENABLED, MANIFEST_MIN_MERGE_COUNT};
+use iceberg::transaction::{
+    Transaction, MANIFEST_MERGE_ENABLED, MANIFEST_MIN_MERGE_COUNT, MANIFEST_TARGET_SIZE_BYTES,
+};
 use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder;
 use iceberg::writer::file_writer::location_generator::{
     DefaultFileNameGenerator, DefaultLocationGenerator,
 };
 use iceberg::writer::file_writer::ParquetWriterBuilder;
 use iceberg::writer::{IcebergWriter, IcebergWriterBuilder};
-use iceberg::{Catalog, Namespace, NamespaceIdent, TableCreation};
-use iceberg_integration_tests::set_test_fixture;
+use iceberg::{Catalog, TableCreation};
+use iceberg_catalog_rest::RestCatalog;
 use parquet::file::properties::WriterProperties;
 
+use crate::get_shared_containers;
+use crate::shared_tests::random_ns;
+
 async fn write_new_data_file(table: &Table) -> Vec<DataFile> {
     let schema: Arc<arrow_schema::Schema> = Arc::new(
         table
@@ -60,9 +65,11 @@ async fn write_new_data_file(table: &Table) -> Vec<DataFile> {
     );
     let data_file_writer_builder = DataFileWriterBuilder::new(parquet_writer_builder, None);
     let mut data_file_writer = data_file_writer_builder.build().await.unwrap();
-    let col1 = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
-    let col2 = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)]);
-    let col3 = BooleanArray::from(vec![Some(true), Some(false), None, Some(false)]);
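+    // 100 identical rows per batch: presumably sized so the data files, and the
+    // manifests that track them, are large enough for the size-based merge tested below.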
+    let col1 = StringArray::from(vec![Some("foo"); 100]);
+    let col2 = Int32Array::from(vec![Some(1); 100]);
+    let col3 = BooleanArray::from(vec![Some(true); 100]);
     let batch = RecordBatch::try_new(schema.clone(), vec![
         Arc::new(col1) as ArrayRef,
         Arc::new(col2) as ArrayRef,
@@ -75,21 +82,12 @@ async fn write_new_data_file(table: &Table) -> Vec<DataFile> {
 
 #[tokio::test]
 async fn test_append_data_file() {
-    let fixture = set_test_fixture("test_create_table").await;
+    let fixture = get_shared_containers();
+    let rest_catalog = RestCatalog::new(fixture.catalog_config.clone());
+    let ns = random_ns().await;
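+    // Shared test containers plus a randomized namespace presumably isolate this test
+    // from the other shared_tests; see the get_shared_containers/random_ns helpers.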
 
-    // Create table
-    let ns = Namespace::with_properties(
-        NamespaceIdent::from_strs(["apple", "ios"]).unwrap(),
-        HashMap::from([
-            ("owner".to_string(), "ray".to_string()),
-            ("community".to_string(), "apache".to_string()),
-        ]),
-    );
-    fixture
-        .rest_catalog
-        .create_namespace(ns.name(), ns.properties().clone())
-        .await
-        .unwrap();
     let schema = Schema::builder()
         .with_schema_id(1)
         .with_identifier_field_ids(vec![2])
@@ -104,8 +102,7 @@ async fn test_append_data_file() {
         .name("t1".to_string())
         .schema(schema.clone())
         .build();
-    let mut table = fixture
-        .rest_catalog
+    let mut table = rest_catalog
         .create_table(ns.name(), table_creation)
         .await
         .unwrap();
@@ -116,9 +113,12 @@ async fn test_append_data_file() {
         .set_properties(HashMap::from([
             (MANIFEST_MERGE_ENABLED.to_string(), "true".to_string()),
             (MANIFEST_MIN_MERGE_COUNT.to_string(), "4".to_string()),
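+            // 7000 bytes is presumably tuned so the small manifests written in this
+            // test pack into two merge groups; see the assertions further down.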
+            (MANIFEST_TARGET_SIZE_BYTES.to_string(), "7000".to_string()),
         ]))
         .unwrap()
-        .commit(&fixture.rest_catalog)
+        .commit(&rest_catalog)
         .await
         .unwrap();
@@ -130,7 +130,7 @@ async fn test_append_data_file() {
         let mut append_action = tx.fast_append(None, vec![]).unwrap();
         append_action.add_data_files(data_file.clone()).unwrap();
         let tx = append_action.apply().await.unwrap();
-        table = tx.commit(&fixture.rest_catalog).await.unwrap()
+        table = tx.commit(&rest_catalog).await.unwrap()
     }
     let manifest_list = table
         .metadata()
@@ -140,44 +140,79 @@ async fn test_append_data_file() {
         .await
         .unwrap();
     assert_eq!(manifest_list.entries().len(), 3);
-    for entry in manifest_list.entries() {
+
+    // Construct the expected manifest entries for the merge check below.
+    for (idx, entry) in manifest_list.entries().iter().enumerate() {
         let manifest = entry.load_manifest(table.file_io()).await.unwrap();
         assert!(manifest.entries().len() == 1);
 
-        original_manifest_entries.push(Arc::new(
-            ManifestEntry::builder()
-                .status(ManifestStatus::Existing)
-                .snapshot_id(manifest.entries()[0].snapshot_id().unwrap())
-                .sequence_number(manifest.entries()[0].sequence_number().unwrap())
-                .file_sequence_number(manifest.entries()[0].file_sequence_number().unwrap())
-                .data_file(manifest.entries()[0].data_file().clone())
-                .build(),
-        ));
+        // The first manifest gets packed with the first newly added manifest, and that
+        // group's count (2) is below the min merge count (4), so the two are not merged.
+        // See `MergeManifestProcess::merge_group` for details.
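+        // Entries of an unmerged manifest keep the Added status from their original
+        // commit, while entries rewritten into a merged manifest become Existing.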
+        if idx == 0 {
+            original_manifest_entries.push(Arc::new(
+                ManifestEntry::builder()
+                    .status(ManifestStatus::Added)
+                    .snapshot_id(manifest.entries()[0].snapshot_id().unwrap())
+                    .sequence_number(manifest.entries()[0].sequence_number().unwrap())
+                    .file_sequence_number(manifest.entries()[0].file_sequence_number().unwrap())
+                    .data_file(manifest.entries()[0].data_file().clone())
+                    .build(),
+            ));
+        } else {
+            original_manifest_entries.push(Arc::new(
+                ManifestEntry::builder()
+                    .status(ManifestStatus::Existing)
+                    .snapshot_id(manifest.entries()[0].snapshot_id().unwrap())
+                    .sequence_number(manifest.entries()[0].sequence_number().unwrap())
+                    .file_sequence_number(manifest.entries()[0].file_sequence_number().unwrap())
+                    .data_file(manifest.entries()[0].data_file().clone())
+                    .build(),
+            ));
+        }
     }
 
-    // append data file with merge append, 4 data file will be merged to one manifest
+    // Append data files with merge append; the 4 new data files will be merged into two manifests.
     let data_file = write_new_data_file(&table).await;
     let tx = Transaction::new(&table);
     let mut merge_append_action = tx.merge_append(None, vec![]).unwrap();
     merge_append_action
         .add_data_files(data_file.clone())
         .unwrap();
     let tx = merge_append_action.apply().await.unwrap();
-    table = tx.commit(&fixture.rest_catalog).await.unwrap();
+    table = tx.commit(&rest_catalog).await.unwrap();
+    // Check the manifest files.
     let manifest_list = table
         .metadata()
         .current_snapshot()
         .unwrap()
         .load_manifest_list(table.file_io(), table.metadata())
         .await
         .unwrap();
-    assert_eq!(manifest_list.entries().len(), 1);
-    let manifest = manifest_list.entries()[0]
-        .load_manifest(table.file_io())
-        .await
-        .unwrap();
-    assert!(manifest.entries().len() == 4);
-    for original_entry in original_manifest_entries.iter() {
-        assert!(manifest.entries().contains(original_entry));
+    assert_eq!(manifest_list.entries().len(), 3);
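+    // entries()[0] presumably holds the newly appended data files; the assertions
+    // below only cover the three original entries carried over from fast append.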
+    {
+        let manifest = manifest_list.entries()[1]
+            .load_manifest(table.file_io())
+            .await
+            .unwrap();
+        assert!(manifest.entries().len() == 1);
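+        // Presumably the unmerged first manifest: drop its entry from the expected
+        // set so only the two merged originals remain to be checked in entries()[2].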
+        original_manifest_entries.retain(|entry| !manifest.entries().contains(entry));
+        assert!(original_manifest_entries.len() == 2);
+    }
+    {
+        let manifest = manifest_list.entries()[2]
+            .load_manifest(table.file_io())
+            .await
+            .unwrap();
+        assert!(manifest.entries().len() == 2);
+        for original_entry in original_manifest_entries.iter() {
+            assert!(manifest.entries().contains(original_entry));
+        }
     }
 }