Skip to content

Commit 4fba3f4

Browse files
ZENOTME authored
- fix equality delete writer field id project (#751)
- fix decimal parse of parquet statistics Co-authored-by: ZENOTME <[email protected]>
1 parent 1798b30 commit 4fba3f4

File tree

3 files changed

+208
-5
lines changed

3 files changed

+208
-5
lines changed

crates/iceberg/src/arrow/schema.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -723,10 +723,21 @@ macro_rules! get_parquet_stat_as_datum {
723723
let Some(bytes) = stats.[<$limit_type _bytes_opt>]() else {
724724
return Ok(None);
725725
};
726-
727726
Some(Datum::new(
728727
primitive_type.clone(),
729-
PrimitiveLiteral::Int128(i128::from_le_bytes(bytes.try_into()?)),
728+
PrimitiveLiteral::Int128(i128::from_be_bytes(bytes.try_into()?)),
729+
))
730+
}
731+
(PrimitiveType::Decimal {
732+
precision: _,
733+
scale: _,
734+
}, Statistics::FixedLenByteArray(stats)) => {
735+
let Some(bytes) = stats.[<$limit_type _bytes_opt>]() else {
736+
return Ok(None);
737+
};
738+
Some(Datum::new(
739+
primitive_type.clone(),
740+
PrimitiveLiteral::Int128(i128::from_be_bytes(bytes.try_into()?)),
730741
))
731742
}
732743
(

crates/iceberg/src/writer/base_writer/equality_delete_writer.rs

Lines changed: 158 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ impl EqualityDeleteWriterConfig {
7171
|field| {
7272
// Only primitive type is allowed to be used for identifier field ids
7373
if field.is_nullable()
74-
|| !field.data_type().is_primitive()
74+
|| field.data_type().is_nested()
7575
|| matches!(
7676
field.data_type(),
7777
DataType::Float16 | DataType::Float32 | DataType::Float64
@@ -169,13 +169,14 @@ mod test {
169169
use std::sync::Arc;
170170

171171
use arrow_array::types::Int32Type;
172-
use arrow_array::{ArrayRef, Int32Array, RecordBatch, StructArray};
172+
use arrow_array::{ArrayRef, BooleanArray, Int32Array, Int64Array, RecordBatch, StructArray};
173173
use arrow_schema::DataType;
174174
use arrow_select::concat::concat_batches;
175175
use itertools::Itertools;
176176
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
177177
use parquet::file::properties::WriterProperties;
178178
use tempfile::TempDir;
179+
use uuid::Uuid;
179180

180181
use crate::arrow::{arrow_schema_to_schema, schema_to_arrow_schema};
181182
use crate::io::{FileIO, FileIOBuilder};
@@ -500,4 +501,159 @@ mod test {
500501

501502
Ok(())
502503
}
504+
505+
#[tokio::test]
506+
async fn test_equality_delete_with_primitive_type() -> Result<(), anyhow::Error> {
507+
let temp_dir = TempDir::new().unwrap();
508+
let file_io = FileIOBuilder::new_fs_io().build().unwrap();
509+
let location_gen =
510+
MockLocationGenerator::new(temp_dir.path().to_str().unwrap().to_string());
511+
let file_name_gen =
512+
DefaultFileNameGenerator::new("test".to_string(), None, DataFileFormat::Parquet);
513+
514+
let schema = Arc::new(
515+
Schema::builder()
516+
.with_schema_id(1)
517+
.with_fields(vec![
518+
NestedField::required(0, "col0", Type::Primitive(PrimitiveType::Boolean))
519+
.into(),
520+
NestedField::required(1, "col1", Type::Primitive(PrimitiveType::Int)).into(),
521+
NestedField::required(2, "col2", Type::Primitive(PrimitiveType::Long)).into(),
522+
NestedField::required(
523+
3,
524+
"col3",
525+
Type::Primitive(PrimitiveType::Decimal {
526+
precision: 38,
527+
scale: 5,
528+
}),
529+
)
530+
.into(),
531+
NestedField::required(4, "col4", Type::Primitive(PrimitiveType::Date)).into(),
532+
NestedField::required(5, "col5", Type::Primitive(PrimitiveType::Time)).into(),
533+
NestedField::required(6, "col6", Type::Primitive(PrimitiveType::Timestamp))
534+
.into(),
535+
NestedField::required(7, "col7", Type::Primitive(PrimitiveType::Timestamptz))
536+
.into(),
537+
NestedField::required(8, "col8", Type::Primitive(PrimitiveType::TimestampNs))
538+
.into(),
539+
NestedField::required(9, "col9", Type::Primitive(PrimitiveType::TimestamptzNs))
540+
.into(),
541+
NestedField::required(10, "col10", Type::Primitive(PrimitiveType::String))
542+
.into(),
543+
NestedField::required(11, "col11", Type::Primitive(PrimitiveType::Uuid)).into(),
544+
NestedField::required(12, "col12", Type::Primitive(PrimitiveType::Fixed(10)))
545+
.into(),
546+
NestedField::required(13, "col13", Type::Primitive(PrimitiveType::Binary))
547+
.into(),
548+
])
549+
.build()
550+
.unwrap(),
551+
);
552+
let equality_ids = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
553+
let config = EqualityDeleteWriterConfig::new(equality_ids, schema.clone(), None).unwrap();
554+
let delete_arrow_schema = config.projected_arrow_schema_ref().clone();
555+
let delete_schema = arrow_schema_to_schema(&delete_arrow_schema).unwrap();
556+
557+
let pb = ParquetWriterBuilder::new(
558+
WriterProperties::builder().build(),
559+
Arc::new(delete_schema),
560+
file_io.clone(),
561+
location_gen,
562+
file_name_gen,
563+
);
564+
let mut equality_delete_writer = EqualityDeleteFileWriterBuilder::new(pb)
565+
.build(config)
566+
.await?;
567+
568+
// prepare data
569+
let col0 = Arc::new(BooleanArray::from(vec![
570+
Some(true),
571+
Some(false),
572+
Some(true),
573+
])) as ArrayRef;
574+
let col1 = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(4)])) as ArrayRef;
575+
let col2 = Arc::new(Int64Array::from(vec![Some(1), Some(2), Some(4)])) as ArrayRef;
576+
let col3 = Arc::new(
577+
arrow_array::Decimal128Array::from(vec![Some(1), Some(2), Some(4)])
578+
.with_precision_and_scale(38, 5)
579+
.unwrap(),
580+
) as ArrayRef;
581+
let col4 = Arc::new(arrow_array::Date32Array::from(vec![
582+
Some(0),
583+
Some(1),
584+
Some(3),
585+
])) as ArrayRef;
586+
let col5 = Arc::new(arrow_array::Time64MicrosecondArray::from(vec![
587+
Some(0),
588+
Some(1),
589+
Some(3),
590+
])) as ArrayRef;
591+
let col6 = Arc::new(arrow_array::TimestampMicrosecondArray::from(vec![
592+
Some(0),
593+
Some(1),
594+
Some(3),
595+
])) as ArrayRef;
596+
let col7 = Arc::new(
597+
arrow_array::TimestampMicrosecondArray::from(vec![Some(0), Some(1), Some(3)])
598+
.with_timezone_utc(),
599+
) as ArrayRef;
600+
let col8 = Arc::new(arrow_array::TimestampNanosecondArray::from(vec![
601+
Some(0),
602+
Some(1),
603+
Some(3),
604+
])) as ArrayRef;
605+
let col9 = Arc::new(
606+
arrow_array::TimestampNanosecondArray::from(vec![Some(0), Some(1), Some(3)])
607+
.with_timezone_utc(),
608+
) as ArrayRef;
609+
let col10 = Arc::new(arrow_array::StringArray::from(vec![
610+
Some("a"),
611+
Some("b"),
612+
Some("d"),
613+
])) as ArrayRef;
614+
let col11 = Arc::new(
615+
arrow_array::FixedSizeBinaryArray::try_from_sparse_iter_with_size(
616+
vec![
617+
Some(Uuid::from_u128(0).as_bytes().to_vec()),
618+
Some(Uuid::from_u128(1).as_bytes().to_vec()),
619+
Some(Uuid::from_u128(3).as_bytes().to_vec()),
620+
]
621+
.into_iter(),
622+
16,
623+
)
624+
.unwrap(),
625+
) as ArrayRef;
626+
let col12 = Arc::new(
627+
arrow_array::FixedSizeBinaryArray::try_from_sparse_iter_with_size(
628+
vec![
629+
Some(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
630+
Some(vec![11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
631+
Some(vec![21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
632+
]
633+
.into_iter(),
634+
10,
635+
)
636+
.unwrap(),
637+
) as ArrayRef;
638+
let col13 = Arc::new(arrow_array::LargeBinaryArray::from_opt_vec(vec![
639+
Some(b"one"),
640+
Some(b""),
641+
Some(b"zzzz"),
642+
])) as ArrayRef;
643+
let to_write = RecordBatch::try_new(delete_arrow_schema.clone(), vec![
644+
col0, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13,
645+
])
646+
.unwrap();
647+
equality_delete_writer.write(to_write.clone()).await?;
648+
let res = equality_delete_writer.close().await?;
649+
assert_eq!(res.len(), 1);
650+
check_parquet_data_file_with_equality_delete_write(
651+
&file_io,
652+
&res.into_iter().next().unwrap(),
653+
&to_write,
654+
)
655+
.await;
656+
657+
Ok(())
658+
}
503659
}

crates/iceberg/src/writer/file_writer/parquet_writer.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,17 @@ mod tests {
538538
NestedField::optional(14, "uuid", Type::Primitive(PrimitiveType::Uuid)).into(),
539539
NestedField::optional(15, "fixed", Type::Primitive(PrimitiveType::Fixed(10)))
540540
.into(),
541+
// Parquet Statistics will use different representation for Decimal with precision 38 and scale 5,
542+
// so we need to add a new field for it.
543+
NestedField::optional(
544+
16,
545+
"decimal_38",
546+
Type::Primitive(PrimitiveType::Decimal {
547+
precision: 38,
548+
scale: 5,
549+
}),
550+
)
551+
.into(),
541552
])
542553
.build()
543554
.unwrap()
@@ -1028,9 +1039,14 @@ mod tests {
10281039
)
10291040
.unwrap(),
10301041
) as ArrayRef;
1042+
let col16 = Arc::new(
1043+
arrow_array::Decimal128Array::from(vec![Some(1), Some(2), None, Some(100)])
1044+
.with_precision_and_scale(38, 5)
1045+
.unwrap(),
1046+
) as ArrayRef;
10311047
let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![
10321048
col0, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13,
1033-
col14, col15,
1049+
col14, col15, col16,
10341050
])
10351051
.unwrap();
10361052

@@ -1092,6 +1108,16 @@ mod tests {
10921108
),
10931109
(14, Datum::uuid(Uuid::from_u128(0))),
10941110
(15, Datum::fixed(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])),
1111+
(
1112+
16,
1113+
Datum::new(
1114+
PrimitiveType::Decimal {
1115+
precision: 38,
1116+
scale: 5
1117+
},
1118+
PrimitiveLiteral::Int128(1)
1119+
)
1120+
),
10951121
])
10961122
);
10971123
assert_eq!(
@@ -1125,6 +1151,16 @@ mod tests {
11251151
15,
11261152
Datum::fixed(vec![21, 22, 23, 24, 25, 26, 27, 28, 29, 30])
11271153
),
1154+
(
1155+
16,
1156+
Datum::new(
1157+
PrimitiveType::Decimal {
1158+
precision: 38,
1159+
scale: 5
1160+
},
1161+
PrimitiveLiteral::Int128(100)
1162+
)
1163+
),
11281164
])
11291165
);
11301166

0 commit comments

Comments (0)