Skip to content

Commit fbc47c4

Browse files
committed
Iceberg schema
1 parent e37890b commit fbc47c4

File tree

9 files changed

+1173
-724
lines changed

9 files changed

+1173
-724
lines changed

Cargo.lock

Lines changed: 23 additions & 23 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,14 @@ rust-version = "1.77.1"
4141
anyhow = "1.0.72"
4242
apache-avro = "0.17"
4343
array-init = "2"
44-
arrow-arith = { version = "54.1.0" }
45-
arrow-array = { version = "54.1.0" }
46-
arrow-buffer = { version = "54.1.0" }
47-
arrow-cast = { version = "54.1.0" }
48-
arrow-ord = { version = "54.1.0" }
49-
arrow-schema = { version = "54.1.0" }
50-
arrow-select = { version = "54.1.0" }
51-
arrow-string = { version = "54.1.0" }
44+
arrow-arith = { version = "54.2.1" }
45+
arrow-array = { version = "54.2.1" }
46+
arrow-buffer = { version = "54.2.1" }
47+
arrow-cast = { version = "54.2.1" }
48+
arrow-ord = { version = "54.2.1" }
49+
arrow-schema = { version = "54.2.1" }
50+
arrow-select = { version = "54.2.1" }
51+
arrow-string = { version = "54.2.1" }
5252
async-stream = "0.3.5"
5353
async-trait = "0.1.86"
5454
async-std = "1.12"
@@ -78,7 +78,7 @@ num-bigint = "0.4.6"
7878
once_cell = "1.20"
7979
opendal = "0.51.2"
8080
ordered-float = "4"
81-
parquet = "54.1.0"
81+
parquet = "54.2.1"
8282
paste = "1.0.15"
8383
pilota = "0.11.2"
8484
pretty_assertions = "1.4"

crates/iceberg/src/arrow/schema.rs

Lines changed: 0 additions & 187 deletions
Original file line numberDiff line numberDiff line change
@@ -827,193 +827,6 @@ get_parquet_stat_as_datum!(min);
827827

828828
get_parquet_stat_as_datum!(max);
829829

830-
/// Utilities to deal with [arrow_array::builder] types in the Iceberg context.
831-
pub(crate) mod builder {
832-
use arrow_array::builder::*;
833-
use arrow_array::cast::AsArray;
834-
use arrow_array::types::*;
835-
use arrow_array::{ArrayRef, Datum as ArrowDatum};
836-
use arrow_schema::{DataType, TimeUnit};
837-
use ordered_float::OrderedFloat;
838-
839-
use crate::spec::{Literal, PrimitiveLiteral};
840-
use crate::{Error, ErrorKind};
841-
842-
/// A helper wrapping [ArrayBuilder] for building arrays without declaring the inner type at
843-
/// compile-time when types are determined dynamically (e.g. based on some column type).
844-
/// A [DataType] is given at construction time which is used to later downcast the inner array
845-
/// and provided values.
846-
pub(crate) struct AnyArrayBuilder {
847-
data_type: DataType,
848-
inner: Box<dyn ArrayBuilder>,
849-
}
850-
851-
impl AnyArrayBuilder {
852-
pub(crate) fn new(data_type: &DataType) -> Self {
853-
Self {
854-
data_type: data_type.clone(),
855-
inner: make_builder(data_type, 0),
856-
}
857-
}
858-
859-
pub(crate) fn finish(&mut self) -> ArrayRef {
860-
self.inner.finish()
861-
}
862-
863-
/// Append an [[arrow_array::Datum]] value.
864-
pub(crate) fn append_datum(&mut self, value: &dyn ArrowDatum) -> crate::Result<()> {
865-
let (array, is_scalar) = value.get();
866-
assert!(is_scalar, "Can only append scalar datum");
867-
868-
match array.data_type() {
869-
DataType::Boolean => self
870-
.builder::<BooleanBuilder>()?
871-
.append_value(array.as_boolean().value(0)),
872-
DataType::Int32 => self
873-
.builder::<Int32Builder>()?
874-
.append_value(array.as_primitive::<Int32Type>().value(0)),
875-
DataType::Int64 => self
876-
.builder::<Int64Builder>()?
877-
.append_value(array.as_primitive::<Int64Type>().value(0)),
878-
DataType::Float32 => self
879-
.builder::<Float32Builder>()?
880-
.append_value(array.as_primitive::<Float32Type>().value(0)),
881-
DataType::Float64 => self
882-
.builder::<Float64Builder>()?
883-
.append_value(array.as_primitive::<Float64Type>().value(0)),
884-
DataType::Decimal128(_, _) => self
885-
.builder::<Decimal128Builder>()?
886-
.append_value(array.as_primitive::<Decimal128Type>().value(0)),
887-
DataType::Date32 => self
888-
.builder::<Date32Builder>()?
889-
.append_value(array.as_primitive::<Date32Type>().value(0)),
890-
DataType::Time64(TimeUnit::Microsecond) => self
891-
.builder::<Time64MicrosecondBuilder>()?
892-
.append_value(array.as_primitive::<Time64MicrosecondType>().value(0)),
893-
DataType::Timestamp(TimeUnit::Microsecond, _) => self
894-
.builder::<TimestampMicrosecondBuilder>()?
895-
.append_value(array.as_primitive::<TimestampMicrosecondType>().value(0)),
896-
DataType::Timestamp(TimeUnit::Nanosecond, _) => self
897-
.builder::<TimestampNanosecondBuilder>()?
898-
.append_value(array.as_primitive::<TimestampNanosecondType>().value(0)),
899-
DataType::Utf8 => self
900-
.builder::<StringBuilder>()?
901-
.append_value(array.as_string::<i32>().value(0)),
902-
DataType::FixedSizeBinary(_) => self
903-
.builder::<BinaryBuilder>()?
904-
.append_value(array.as_fixed_size_binary().value(0)),
905-
DataType::LargeBinary => self
906-
.builder::<LargeBinaryBuilder>()?
907-
.append_value(array.as_binary::<i64>().value(0)),
908-
_ => {
909-
return Err(Error::new(
910-
ErrorKind::FeatureUnsupported,
911-
format!("Cannot append data type: {:?}", array.data_type(),),
912-
));
913-
}
914-
}
915-
Ok(())
916-
}
917-
918-
/// Append a literal with the provided [DataType]. We're not solely relying on the literal to
919-
/// infer the type because [Literal] values do not specify the expected type of builder. E.g.,
920-
/// a [PrimitiveLiteral::Long] may go into an array builder for longs but also for timestamps.
921-
pub(crate) fn append_literal(&mut self, value: &Literal) -> crate::Result<()> {
922-
let Some(primitive) = value.as_primitive_literal() else {
923-
return Err(Error::new(
924-
ErrorKind::FeatureUnsupported,
925-
"Expected primitive type",
926-
));
927-
};
928-
929-
match (&self.data_type, primitive.clone()) {
930-
(DataType::Boolean, PrimitiveLiteral::Boolean(value)) => {
931-
self.builder::<BooleanBuilder>()?.append_value(value)
932-
}
933-
(DataType::Int32, PrimitiveLiteral::Int(value)) => {
934-
self.builder::<Int32Builder>()?.append_value(value)
935-
}
936-
(DataType::Int64, PrimitiveLiteral::Long(value)) => {
937-
self.builder::<Int64Builder>()?.append_value(value)
938-
}
939-
(DataType::Float32, PrimitiveLiteral::Float(OrderedFloat(value))) => {
940-
self.builder::<Float32Builder>()?.append_value(value)
941-
}
942-
(DataType::Float64, PrimitiveLiteral::Double(OrderedFloat(value))) => {
943-
self.builder::<Float64Builder>()?.append_value(value)
944-
}
945-
(DataType::Utf8, PrimitiveLiteral::String(value)) => {
946-
self.builder::<StringBuilder>()?.append_value(value)
947-
}
948-
(DataType::FixedSizeBinary(_), PrimitiveLiteral::Binary(value)) => self
949-
.builder::<FixedSizeBinaryBuilder>()?
950-
.append_value(value)?,
951-
(DataType::LargeBinary, PrimitiveLiteral::Binary(value)) => {
952-
self.builder::<LargeBinaryBuilder>()?.append_value(value)
953-
}
954-
(_, _) => {
955-
return Err(Error::new(
956-
ErrorKind::FeatureUnsupported,
957-
format!(
958-
"Builder of type {:?} does not accept literal {:?}",
959-
self.data_type, primitive
960-
),
961-
));
962-
}
963-
}
964-
965-
Ok(())
966-
}
967-
968-
/// Append a null value for the provided [DataType].
969-
pub(crate) fn append_null(&mut self) -> crate::Result<()> {
970-
match self.data_type {
971-
DataType::Boolean => self.builder::<BooleanBuilder>()?.append_null(),
972-
DataType::Int32 => self.builder::<Int32Builder>()?.append_null(),
973-
DataType::Int64 => self.builder::<Int64Builder>()?.append_null(),
974-
DataType::Float32 => self.builder::<Float32Builder>()?.append_null(),
975-
DataType::Float64 => self.builder::<Float64Builder>()?.append_null(),
976-
DataType::Decimal128(_, _) => self.builder::<Decimal128Builder>()?.append_null(),
977-
DataType::Date32 => self.builder::<Date32Builder>()?.append_null(),
978-
DataType::Time64(TimeUnit::Microsecond) => {
979-
self.builder::<Time64MicrosecondBuilder>()?.append_null()
980-
}
981-
DataType::Timestamp(TimeUnit::Microsecond, _) => {
982-
self.builder::<TimestampMicrosecondBuilder>()?.append_null()
983-
}
984-
DataType::Timestamp(TimeUnit::Nanosecond, _) => {
985-
self.builder::<TimestampNanosecondBuilder>()?.append_null()
986-
}
987-
DataType::Utf8 => self.builder::<StringBuilder>()?.append_null(),
988-
DataType::FixedSizeBinary(_) => {
989-
self.builder::<FixedSizeBinaryBuilder>()?.append_null()
990-
}
991-
DataType::LargeBinary => self.builder::<LargeBinaryBuilder>()?.append_null(),
992-
_ => {
993-
return Err(Error::new(
994-
ErrorKind::FeatureUnsupported,
995-
format!(
996-
"Cannot append null values for data type: {:?}",
997-
self.data_type
998-
),
999-
))
1000-
}
1001-
}
1002-
Ok(())
1003-
}
1004-
1005-
/// Cast the `inner` builder to a specific type or return [Error].
1006-
fn builder<T: ArrayBuilder>(&mut self) -> crate::Result<&mut T> {
1007-
self.inner.as_any_mut().downcast_mut::<T>().ok_or_else(|| {
1008-
Error::new(
1009-
ErrorKind::Unexpected,
1010-
"Failed to cast builder to expected type",
1011-
)
1012-
})
1013-
}
1014-
}
1015-
}
1016-
1017830
impl TryFrom<&ArrowSchema> for crate::spec::Schema {
1018831
type Error = Error;
1019832

0 commit comments

Comments
 (0)