diff --git a/.github/actions/setup-builder/action.yml b/.github/actions/setup-builder/action.yml new file mode 100644 index 000000000..43de1cbaa --- /dev/null +++ b/.github/actions/setup-builder/action.yml @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file is heavily inspired by +# [datafusion](https://github.com/apache/datafusion/blob/main/.github/actions/setup-builder/action.yaml). +name: Prepare Rust Builder +description: 'Prepare Rust Build Environment' +inputs: + rust-version: + description: 'version of rust to install (e.g. stable)' + required: true + default: 'stable' +runs: + using: "composite" + steps: + - name: Setup Rust toolchain + shell: bash + run: | + echo "Installing ${{ inputs.rust-version }}" + rustup toolchain install ${{ inputs.rust-version }} + rustup default ${{ inputs.rust-version }} + rustup component add rustfmt clippy + - name: Fixup git permissions + # https://github.com/actions/checkout/issues/766 + shell: bash + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 09f231735..f98f5e75b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true +env: + rust_msrv: "1.77.1" + jobs: check: runs-on: ubuntu-latest @@ -38,6 +41,12 @@ jobs: - name: Check License Header uses: apache/skywalking-eyes/header@v0.6.0 + - name: Install cargo-sort + run: make install-cargo-sort + + - name: Install taplo-cli + run: make install-taplo-cli + - name: Cargo format run: make check-fmt @@ -63,8 +72,14 @@ jobs: - windows-latest steps: - uses: actions/checkout@v4 + + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.rust_msrv }} + - name: Build - run: cargo build + run: make build build_with_no_default_features: runs-on: ${{ matrix.os }} @@ -84,6 +99,11 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.rust_msrv }} + - name: Test run: cargo test --no-fail-fast --all-targets --all-features --workspace diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 9289d3e10..a57aa612f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,13 +21,11 @@ on: push: tags: - '*' - pull_request: - branches: - - main - paths: - - ".github/workflows/publish.yml" workflow_dispatch: +env: + rust_msrv: "1.77.1" + jobs: publish: runs-on: ubuntu-latest @@ -42,9 +40,11 @@ jobs: - "crates/catalog/rest" steps: - uses: actions/checkout@v4 - - name: Dryrun ${{ matrix.package }} - working-directory: ${{ 
matrix.package }} - run: cargo publish --all-features --dry-run + + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.rust_msrv }} - name: Publish ${{ matrix.package }} working-directory: ${{ matrix.package }} diff --git a/Makefile b/Makefile index abee425d7..ed10a8acd 100644 --- a/Makefile +++ b/Makefile @@ -17,34 +17,37 @@ .EXPORT_ALL_VARIABLES: -RUST_LOG = debug - build: - cargo build + cargo build --all-targets --all-features --workspace check-fmt: - cargo fmt --all -- --check + cargo fmt --all -- --check check-clippy: - cargo clippy --all-targets --all-features --workspace -- -D warnings + cargo clippy --all-targets --all-features --workspace -- -D warnings + +install-cargo-sort: + cargo install cargo-sort@1.0.9 -cargo-sort: - cargo install cargo-sort +cargo-sort: install-cargo-sort cargo sort -c -w -cargo-machete: +install-cargo-machete: cargo install cargo-machete + +cargo-machete: install-cargo-machete cargo machete -fix-toml: - cargo install taplo-cli --locked +install-taplo-cli: + cargo install taplo-cli@0.9.0 + +fix-toml: install-taplo-cli taplo fmt -check-toml: - cargo install taplo-cli --locked +check-toml: install-taplo-cli taplo check -check: check-fmt check-clippy cargo-sort check-toml +check: check-fmt check-clippy cargo-sort check-toml cargo-machete doc-test: cargo test --no-fail-fast --doc --all-features --workspace diff --git a/README.md b/README.md index 3f4f7a334..4f8265b79 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,15 @@ The Apache Iceberg Rust project is composed of the following components: [iceberg-catalog-rest link]: https://crates.io/crates/iceberg-catalog-rest [iceberg-catalog-rest release docs]: https://docs.rs/iceberg-catalog-rest +## Supported Rust Version + +Iceberg Rust is built and tested with stable Rust, and we maintain a rolling MSRV (minimum supported Rust version). The +current MSRV is 1.77.1. + +We also use unstable Rust to run linters such as `clippy` and `rustfmt`. This does not affect downstream users, +who only need to meet the MSRV. + + ## Contribute Apache Iceberg is an active open-source project, governed under the Apache Software Foundation (ASF). We are always open to people who want to use or contribute to it. Here are some ways to get involved.
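The Makefile changes above pin the helper lint tools (cargo-sort 1.0.9, taplo-cli 0.9.0) behind dedicated install-* targets and extend the check target to chain rustfmt, clippy, cargo-sort, taplo, and cargo-machete, while CI now installs the same tools before running the format checks. As a rough sketch, a contributor could reproduce that lint pass locally from the repository root with:

    # install the pinned helper tools, mirroring the new CI install steps
    make install-cargo-sort
    make install-taplo-cli
    # run the full lint suite: rustfmt, clippy, cargo-sort, taplo check, cargo-machete
    make check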
diff --git a/crates/iceberg/src/avro/schema.rs b/crates/iceberg/src/avro/schema.rs index d83cdc36b..11d000cc5 100644 --- a/crates/iceberg/src/avro/schema.rs +++ b/crates/iceberg/src/avro/schema.rs @@ -19,10 +19,10 @@ use std::collections::BTreeMap; use crate::spec::{ - visit_schema, ListType, MapType, NestedField, NestedFieldRef, PrimitiveType, Schema, - SchemaVisitor, StructType, Type, + visit_schema, ListType, MapType, NestedFieldRef, PrimitiveType, Schema, SchemaVisitor, + StructType, }; -use crate::{ensure_data_valid, Error, ErrorKind, Result}; +use crate::{Error, ErrorKind, Result}; use apache_avro::schema::{ DecimalSchema, FixedSchema, Name, RecordField as AvroRecordField, RecordFieldOrder, RecordSchema, UnionSchema, @@ -272,205 +272,205 @@ fn avro_optional(avro_schema: AvroSchema) -> Result { ])?)) } -fn is_avro_optional(avro_schema: &AvroSchema) -> bool { - match avro_schema { - AvroSchema::Union(union) => union.is_nullable(), - _ => false, +#[cfg(test)] +mod tests { + use super::*; + use crate::ensure_data_valid; + use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type}; + use apache_avro::schema::{Namespace, UnionSchema}; + use apache_avro::Schema as AvroSchema; + use std::fs::read_to_string; + + fn is_avro_optional(avro_schema: &AvroSchema) -> bool { + match avro_schema { + AvroSchema::Union(union) => union.is_nullable(), + _ => false, + } } -} -/// Post order avro schema visitor. -pub(crate) trait AvroSchemaVisitor { - type T; + /// Post order avro schema visitor. + pub(crate) trait AvroSchemaVisitor { + type T; - fn record(&mut self, record: &RecordSchema, fields: Vec) -> Result; + fn record(&mut self, record: &RecordSchema, fields: Vec) -> Result; - fn union(&mut self, union: &UnionSchema, options: Vec) -> Result; + fn union(&mut self, union: &UnionSchema, options: Vec) -> Result; - fn array(&mut self, array: &AvroSchema, item: Self::T) -> Result; - fn map(&mut self, map: &AvroSchema, value: Self::T) -> Result; + fn array(&mut self, array: &AvroSchema, item: Self::T) -> Result; + fn map(&mut self, map: &AvroSchema, value: Self::T) -> Result; - fn primitive(&mut self, schema: &AvroSchema) -> Result; -} + fn primitive(&mut self, schema: &AvroSchema) -> Result; + } -struct AvroSchemaToSchema { - next_id: i32, -} + struct AvroSchemaToSchema { + next_id: i32, + } -impl AvroSchemaToSchema { - fn next_field_id(&mut self) -> i32 { - self.next_id += 1; - self.next_id + impl AvroSchemaToSchema { + fn next_field_id(&mut self) -> i32 { + self.next_id += 1; + self.next_id + } } -} -impl AvroSchemaVisitor for AvroSchemaToSchema { - // Only `AvroSchema::Null` will return `None` - type T = Option; + impl AvroSchemaVisitor for AvroSchemaToSchema { + // Only `AvroSchema::Null` will return `None` + type T = Option; + + fn record( + &mut self, + record: &RecordSchema, + field_types: Vec>, + ) -> Result> { + let mut fields = Vec::with_capacity(field_types.len()); + for (avro_field, typ) in record.fields.iter().zip_eq(field_types) { + let field_id = avro_field + .custom_attributes + .get(FILED_ID_PROP) + .and_then(Value::as_i64) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Can't convert field, missing field id: {avro_field:?}"), + ) + })?; - fn record( - &mut self, - record: &RecordSchema, - field_types: Vec>, - ) -> Result> { - let mut fields = Vec::with_capacity(field_types.len()); - for (avro_field, typ) in record.fields.iter().zip_eq(field_types) { - let field_id = avro_field - .custom_attributes - .get(FILED_ID_PROP) - 
.and_then(Value::as_i64) - .ok_or_else(|| { - Error::new( - ErrorKind::DataInvalid, - format!("Can't convert field, missing field id: {avro_field:?}"), - ) - })?; + let optional = is_avro_optional(&avro_field.schema); - let optional = is_avro_optional(&avro_field.schema); + let mut field = if optional { + NestedField::optional(field_id as i32, &avro_field.name, typ.unwrap()) + } else { + NestedField::required(field_id as i32, &avro_field.name, typ.unwrap()) + }; - let mut field = if optional { - NestedField::optional(field_id as i32, &avro_field.name, typ.unwrap()) - } else { - NestedField::required(field_id as i32, &avro_field.name, typ.unwrap()) - }; + if let Some(doc) = &avro_field.doc { + field = field.with_doc(doc); + } - if let Some(doc) = &avro_field.doc { - field = field.with_doc(doc); + fields.push(field.into()); } - fields.push(field.into()); + Ok(Some(Type::Struct(StructType::new(fields)))) } - Ok(Some(Type::Struct(StructType::new(fields)))) - } - - fn union( - &mut self, - union: &UnionSchema, - mut options: Vec>, - ) -> Result> { - ensure_data_valid!( - options.len() <= 2 && !options.is_empty(), - "Can't convert avro union type {:?} to iceberg.", - union - ); - - if options.len() > 1 { + fn union( + &mut self, + union: &UnionSchema, + mut options: Vec>, + ) -> Result> { ensure_data_valid!( - options[0].is_none(), + options.len() <= 2 && !options.is_empty(), "Can't convert avro union type {:?} to iceberg.", union ); - } - if options.len() == 1 { - Ok(Some(options.remove(0).unwrap())) - } else { - Ok(Some(options.remove(1).unwrap())) - } - } + if options.len() > 1 { + ensure_data_valid!( + options[0].is_none(), + "Can't convert avro union type {:?} to iceberg.", + union + ); + } - fn array(&mut self, array: &AvroSchema, item: Option) -> Result { - if let AvroSchema::Array(item_schema) = array { - let element_field = NestedField::list_element( - self.next_field_id(), - item.unwrap(), - !is_avro_optional(item_schema), - ) - .into(); - Ok(Some(Type::List(ListType { element_field }))) - } else { - Err(Error::new( - ErrorKind::Unexpected, - "Expected avro array schema, but {array}", - )) + if options.len() == 1 { + Ok(Some(options.remove(0).unwrap())) + } else { + Ok(Some(options.remove(1).unwrap())) + } } - } - fn map(&mut self, map: &AvroSchema, value: Option) -> Result> { - if let AvroSchema::Map(value_schema) = map { - // Due to avro rust implementation's limitation, we can't store attributes in map schema, - // we will fix it later when it has been resolved. 
- let key_field = NestedField::map_key_element( - self.next_field_id(), - Type::Primitive(PrimitiveType::String), - ); - let value_field = NestedField::map_value_element( - self.next_field_id(), - value.unwrap(), - !is_avro_optional(value_schema), - ); - Ok(Some(Type::Map(MapType { - key_field: key_field.into(), - value_field: value_field.into(), - }))) - } else { - Err(Error::new( - ErrorKind::Unexpected, - "Expected avro map schema, but {map}", - )) + fn array(&mut self, array: &AvroSchema, item: Option) -> Result { + if let AvroSchema::Array(item_schema) = array { + let element_field = NestedField::list_element( + self.next_field_id(), + item.unwrap(), + !is_avro_optional(item_schema), + ) + .into(); + Ok(Some(Type::List(ListType { element_field }))) + } else { + Err(Error::new( + ErrorKind::Unexpected, + "Expected avro array schema, but {array}", + )) + } } - } - fn primitive(&mut self, schema: &AvroSchema) -> Result> { - let typ = match schema { - AvroSchema::Decimal(decimal) => { - Type::decimal(decimal.precision as u32, decimal.scale as u32)? + fn map(&mut self, map: &AvroSchema, value: Option) -> Result> { + if let AvroSchema::Map(value_schema) = map { + // Due to avro rust implementation's limitation, we can't store attributes in map schema, + // we will fix it later when it has been resolved. + let key_field = NestedField::map_key_element( + self.next_field_id(), + Type::Primitive(PrimitiveType::String), + ); + let value_field = NestedField::map_value_element( + self.next_field_id(), + value.unwrap(), + !is_avro_optional(value_schema), + ); + Ok(Some(Type::Map(MapType { + key_field: key_field.into(), + value_field: value_field.into(), + }))) + } else { + Err(Error::new( + ErrorKind::Unexpected, + "Expected avro map schema, but {map}", + )) } - AvroSchema::Date => Type::Primitive(PrimitiveType::Date), - AvroSchema::TimeMicros => Type::Primitive(PrimitiveType::Time), - AvroSchema::TimestampMicros => Type::Primitive(PrimitiveType::Timestamp), - AvroSchema::Boolean => Type::Primitive(PrimitiveType::Boolean), - AvroSchema::Int => Type::Primitive(PrimitiveType::Int), - AvroSchema::Long => Type::Primitive(PrimitiveType::Long), - AvroSchema::Float => Type::Primitive(PrimitiveType::Float), - AvroSchema::Double => Type::Primitive(PrimitiveType::Double), - AvroSchema::String | AvroSchema::Enum(_) => Type::Primitive(PrimitiveType::String), - AvroSchema::Fixed(fixed) => { - if let Some(logical_type) = fixed.attributes.get(LOGICAL_TYPE) { - let logical_type = logical_type.as_str().ok_or_else(|| { - Error::new( - ErrorKind::DataInvalid, - "logicalType in attributes of avro schema is not a string type", - ) - })?; - match logical_type { - UUID_LOGICAL_TYPE => Type::Primitive(PrimitiveType::Uuid), - ty => { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - format!( + } + + fn primitive(&mut self, schema: &AvroSchema) -> Result> { + let typ = match schema { + AvroSchema::Decimal(decimal) => { + Type::decimal(decimal.precision as u32, decimal.scale as u32)? 
+ } + AvroSchema::Date => Type::Primitive(PrimitiveType::Date), + AvroSchema::TimeMicros => Type::Primitive(PrimitiveType::Time), + AvroSchema::TimestampMicros => Type::Primitive(PrimitiveType::Timestamp), + AvroSchema::Boolean => Type::Primitive(PrimitiveType::Boolean), + AvroSchema::Int => Type::Primitive(PrimitiveType::Int), + AvroSchema::Long => Type::Primitive(PrimitiveType::Long), + AvroSchema::Float => Type::Primitive(PrimitiveType::Float), + AvroSchema::Double => Type::Primitive(PrimitiveType::Double), + AvroSchema::String | AvroSchema::Enum(_) => Type::Primitive(PrimitiveType::String), + AvroSchema::Fixed(fixed) => { + if let Some(logical_type) = fixed.attributes.get(LOGICAL_TYPE) { + let logical_type = logical_type.as_str().ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "logicalType in attributes of avro schema is not a string type", + ) + })?; + match logical_type { + UUID_LOGICAL_TYPE => Type::Primitive(PrimitiveType::Uuid), + ty => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!( "Logical type {ty} is not support in iceberg primitive type.", ), - )) + )) + } } + } else { + Type::Primitive(PrimitiveType::Fixed(fixed.size as u64)) } - } else { - Type::Primitive(PrimitiveType::Fixed(fixed.size as u64)) } - } - AvroSchema::Bytes => Type::Primitive(PrimitiveType::Binary), - AvroSchema::Null => return Ok(None), - _ => { - return Err(Error::new( - ErrorKind::Unexpected, - "Unable to convert avro {schema} to iceberg primitive type.", - )) - } - }; + AvroSchema::Bytes => Type::Primitive(PrimitiveType::Binary), + AvroSchema::Null => return Ok(None), + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + "Unable to convert avro {schema} to iceberg primitive type.", + )) + } + }; - Ok(Some(typ)) + Ok(Some(typ)) + } } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::avro::schema::AvroSchemaToSchema; - use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type}; - use apache_avro::schema::{Namespace, UnionSchema}; - use apache_avro::Schema as AvroSchema; - use std::fs::read_to_string; /// Visit avro schema in post order visitor. pub(crate) fn visit( diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs b/crates/iceberg/src/writer/file_writer/parquet_writer.rs index 43e49fc8b..5f50e417d 100644 --- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs +++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs @@ -627,9 +627,6 @@ mod tests { use crate::writer::file_writer::location_generator::DefaultFileNameGenerator; use crate::writer::tests::check_parquet_data_file; - #[derive(Clone)] - struct TestLocationGen; - fn schema_for_all_type() -> Schema { Schema::builder() .with_schema_id(1) diff --git a/crates/integrations/datafusion/src/schema.rs b/crates/integrations/datafusion/src/schema.rs index 2ba69621a..f7b1a21d2 100644 --- a/crates/integrations/datafusion/src/schema.rs +++ b/crates/integrations/datafusion/src/schema.rs @@ -89,7 +89,7 @@ impl SchemaProvider for IcebergSchemaProvider { } fn table_exist(&self, name: &str) -> bool { - self.tables.get(name).is_some() + self.tables.contains_key(name) } async fn table(&self, name: &str) -> DFResult>> { diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 6685489a6..7b10a8692 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -16,5 +16,5 @@ # under the License. [toolchain] -channel = "1.77.1" +channel = "nightly-2024-06-10" components = ["rustfmt", "clippy"]
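With rust-toolchain.toml now pinned to nightly-2024-06-10 for local development and lint runs, the stable MSRV is exercised separately in CI through the setup-builder action and the rust_msrv environment variable. A minimal sketch of verifying the 1.77.1 MSRV build locally, assuming rustup is available, might look like:

    # install the MSRV toolchain alongside the pinned nightly
    rustup toolchain install 1.77.1
    # build with the MSRV explicitly, overriding the repository's rust-toolchain.toml
    cargo +1.77.1 build --all-targets --all-features --workspace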